#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
## Purpose: scrape data files for the RA Data Project
# http://www.jasmiths.com/
#
## Author: Andrew Smith (jas3@uchicago.edu)
#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
import os
import getpass
import time
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import NoSuchElementException, TimeoutException, NoSuchFrameException, WebDriverException

# Define a ChromeDriver class; each instance controls its own Chrome browser session
class ChromeDriver:
    """Selenium-driven Chrome session that downloads a list of data files.

    Constructing an instance immediately calls ``setup()``, which changes the
    working directory to ``file_folder``, deletes any of ``files`` already
    present there, and launches Chrome configured to download into that
    folder.

    Args:
        file_folder: Directory used both as the working directory and as
            Chrome's default download directory.
        chromedriver_path: Path to the chromedriver executable.
        files: Names of the data files. Existing copies in ``file_folder``
            are deleted during setup, and the length of this list determines
            how many download links ``get_data()`` clicks.
        data_url: Address of the page that lists the download links.
    """

    def __init__(self, file_folder, chromedriver_path, files, data_url):
        self.file_folder = file_folder
        self.chromedriver_path = chromedriver_path
        self.files = files
        self.data_url = data_url
        # Seconds to wait for a link to become clickable, and also the pause
        # taken after each click (see get_data)
        self.waiting = 15
        # Name of the browser in use
        self.engine = 'chrome'
        # Clean the folder of old files and start the browser right away
        self.setup()

    def setup(self):
        """Clear old downloads from ``file_folder`` and launch Chrome.

        Side effects: changes the process working directory to
        ``file_folder``, removes every file named in ``files`` that exists
        there, and stores the new WebDriver on ``self.driver``.
        """
        os.chdir(self.file_folder)

        # Removing files from directory (if necessary) -- presumably so that
        # a fresh download is saved under its original name
        for name in self.files:
            stale_path = os.path.join(self.file_folder, name)
            if os.path.exists(stale_path):
                os.remove(stale_path)

        # Setting downloading preferences: save into file_folder
        chrome_options = Options()
        chrome_options.add_experimental_option(
            "prefs", {"download.default_directory": self.file_folder}
        )

        # NOTE(review): newer Selenium releases deprecate the
        # `executable_path` and `chrome_options` keywords -- confirm against
        # the installed Selenium version before upgrading.
        self.driver = webdriver.Chrome(
            executable_path=self.chromedriver_path,
            chrome_options=chrome_options
        )

    def get_data(self):
        """Open ``data_url`` and click one download link per entry in ``files``.

        Links are located by position (1-based) in the page's ordered list.
        If a link does not become clickable within ``self.waiting`` seconds,
        the exception raised by ``WebDriverWait.until`` propagates.
        """
        # Opening
        print('Opening Chrome')
        self.driver.get(self.data_url)

        # Downloading data
        print("Downloading Data Files")
        wait = WebDriverWait(self.driver, self.waiting)
        # XPath list positions are 1-based, so count from 1
        for position in range(1, len(self.files) + 1):
            link_xpath = "/html/body/main/div/ol/li[{}]/a".format(position)
            # Clicking on the download link once it is clickable
            wait.until(EC.element_to_be_clickable((By.XPATH, link_xpath))).click()

            # Pause before the next task so the download has time to finish
            time.sleep(self.waiting)

    def close(self):
        """Quit the browser started by ``setup()`` and end the WebDriver session."""
        self.driver.quit()

The following script instantiates a ChromeDriver object and uses it to scrape the data files, given the path and filename arguments specific to your machine and the task at hand.

import os
import getpass
from ChromeDriver import ChromeDriver

# The following if statement selects the machine-specific paths based on the
# current username; add a branch with your own username and paths.
if getpass.getuser()=="nadialucas":
    file_folder = r"/Users/nadialucas/Documents/EPIC/2018/orientation_sessions/txt"
    # copy the path after typing "which chromedriver" into your command-line
    chromedriver_path = r"/Users/nadialucas/anaconda/bin/chromedriver" 
else:
    # Without this branch the script would fail further down with an
    # unexplained NameError on file_folder / chromedriver_path.
    raise RuntimeError(
        "No paths configured for user '{}': add a branch that sets "
        "file_folder and chromedriver_path.".format(getpass.getuser())
    )
files = ["demographics.txt","house_age.txt","house_chars1.txt","house_chars2.txt","house_type.txt"]
data_url = "http://www.jasmiths.com/data_project"
driverObj = ChromeDriver(file_folder, chromedriver_path, files, data_url)
try:
    driverObj.get_data()
finally:
    # Always shut the browser down, even if a download link was not found
    driverObj.driver.quit()

Advanced method

results matching ""

No results matching ""