There is a code, it works like this, first it gets all the links, and then it starts parsing, how to make it take 1 link and start parsing?

K

kopelev20002019-02-14 17:48:01

Python

kopelev2000, 2019-02-14 17:48:01

There is a code, it works like this, first it gets all the links, and then it starts parsing, how to make it take 1 link and start parsing? If the account has 1000 pages to go to, it will spend a lot of time filling in the "urls" list, and only then it will start scraping off the information, how can it be redone?

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import *
import time
import random

f = open('udemy_titles', 'a', encoding='utf8')
file = open("input_1.txt", 'r')
login_pass_dict = dict()
for x in [line.strip().split(':') for line in file]:
    login_pass_dict.update({x[0]: x[1]})
for login, password in login_pass_dict.items():

    f.write("\n" + "USERNAME: ")
    f.write(login + "\n")
    f.write("PASSWORD: ")
    f.write(password + "\n")

    myProxy = random.choice(open('proxy.txt').readlines())
    Proxy_list = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': myProxy,
        'httpsProxy': myProxy,
        'ftpProxy': myProxy,
        'sslProxy': myProxy,
        'noProxy': ''
        })
    print(myProxy + "\n")

    driver= webdriver.Firefox(proxy=Proxy_list)
    driver.get('https://www.udemy.com')
    print("GOT URL\n")
    time.sleep(5)

    driver.find_element_by_xpath("//button[@data-purpose='header-login']").click()
    print("OPEN LOGIN FORM\n")
    time.sleep(5)

    webdriver.ActionChains(driver).move_by_offset(570, 295).click().send_keys(login).perform()
    print("PRINT MAIL\n")
    time.sleep(5)

    webdriver.ActionChains(driver).move_by_offset(100, 65).click().send_keys(password).perform()
    print("PRINT PASSWORD\n")
    time.sleep(5)

    try:
        webdriver.ActionChains(driver).move_by_offset(0, 60).click().perform()
        print("AUTORIZATION\n")
        time.sleep(6)

        driver.find_element_by_xpath("//a[@data-purpose='my-courses']").click()
        print("GO TO URL\n")
        time.sleep(5)
        try:
            def parse_pagination(driver):
                ul_pagination = driver.find_element_by_css_selector("ul.pagination.pagination-expanded")
                li_pagination = ul_pagination.find_elements_by_css_selector("li")[-2]
                count_page = int(li_pagination.find_element_by_css_selector("a").text)
                urls = []
                for i in range(2, count_page + 1):
                    urls.append("https://www.udemy.com/home/my-courses/learning/?p=" + str(i))
                    time.sleep(2)
                return urls

            def parse_list(driver):
                div_card_wrapper = WebDriverWait(driver, 10).until(
                     EC.visibility_of_element_located((By.CSS_SELECTOR, "div.card-wrapper")))
                div_cards = div_card_wrapper.find_elements_by_css_selector("div.card.card--learning")
                for div_card in div_cards:
                    a = div_card.find_element_by_css_selector("a.card--learning__details > div > strong")
                    name = a.text
                    print(name)
                    f.write(name + "\n")
        except:
            pass

        try:
            div  = driver.find_element_by_xpath("//div[@class='pager-label']").text
            div_1 = div.split(" ")[-2]
            f.write("TOTAL COURSES: " )
            f.write(div_1 + "\n")
        except:
            pass

        try:
            parse_list(driver)
            urls = parse_pagination(driver)
            for url in urls:
                driver.get(url)
                time.sleep(2)
                parse_list(driver)
        except:
            pass

        driver.close()
        time.sleep(2)
    except:
        f.write("LOGIN OR PASSWORD IS INCORRECTLY PROVIDED" + "\n")
        driver.close()
        time.sleep(1)

f.close()
file.close()

Tried to change the functions like this:

def url_parse(driver):
    ul_pagination = driver.find_element_by_css_selector("ul.pagination.pagination-expanded")
    li_pagination = ul_pagination.find_elements_by_css_selector("li")[-2]
    count_page = int(li_pagination.find_element_by_css_selector("a").text)
    for page in range(2, count_page):
        urls = driver.get("https://www.udemy.com/home/my-courses/learning/?p=" + str(page))
        time.sleep(2)
    return urls


def parse_list(driver):
    div_card_wrapper = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "div.card-wrapper")))
    div_cards = div_card_wrapper.find_elements_by_css_selector("div.card.card--learning")
    for div_card in div_cards:
        a = div_card.find_element_by_css_selector("a.card--learning__details > div > strong")
        name = a.text
        print(name)

And then call the functions like this:

parse_list(driver)
ul_pagination = driver.find_element_by_css_selector("ul.pagination.pagination-expanded")
li_pagination = ul_pagination.find_elements_by_css_selector("li")[-2]
count_page = int(li_pagination.find_element_by_css_selector("a").text)
for number_page in range(2, count_page):
    parse_list(driver)
    url_parse(driver)

Reply

Answer the question

In order to leave comments, you need to log in

1 answer(s)

Y

Yura Khlyan, 2019-02-14
@kopelev2000

This is where you get the url

urls.append("https://www.udemy.com/home/my-courses/learning/?p=" + str(i))

,
instead of stuffing it into a sheet, run the parser.