Python
ohuevshiy, 2020-02-26 21:48:14

Working with CSV in Python, or importing files?

Hello everyone!
I'm practicing on an Avito parser. The first file parses the ads and saves the information to a CSV file, namely: "ad name", "price", "publication date" and "link".
The second file is responsible for grabbing the picture with the phone number and converting it to a string with the tesseract-ocr library.

I imported the second file into the first, and the problem is how to hook it in there properly and efficiently.
The idea is that the first file, when it reaches an ad, opens it, also parses the picture with the number, converts it to text, and saves it into the same CSV table.

Tell me how to tie the two together, or is there a way to work with CSV files so that the second file can refer to the CSV table where the ad links have already been collected?

Many thanks in advance to everyone who responds! Don't throw tomatoes, I'm still green at this :)
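
To make the second option concrete, here is a rough sketch of what I mean by reading the links back out of the CSV (assuming the semicolon-delimited column order that write_csv in the first file uses, with the ad URL in the last column):

import csv

# Sketch only: walk over the ad links already saved by the first file.
# Column order assumed from write_csv: title;price;published;url
with open('avito.csv', 'r', errors='ignore') as f:
  reader = csv.reader(f, delimiter=';')
  for row in reader:
    if not row:
      continue        # skip empty rows, if any
    url = row[-1]     # the ad link is the last column
    print(url)        # here the Bot would open the page and OCR the number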

The first file:

import requests
from bs4 import BeautifulSoup
import csv
from number import Bot




def get_html(url):
  # Download a page and return its HTML as text
  r = requests.get(url)
  return r.text

def get_total_pages(html):
  # The last link in the pagination block points at the last page;
  # its href carries the page number in the "p=" query parameter
  soup = BeautifulSoup(html, 'lxml')

  pages = soup.find('div', class_='pagination-pages').find_all('a', class_='pagination-page')[-1].get('href')
  total_pages = pages.split('=')[1].split('&')[0]

  return int(total_pages)

def write_csv(data):
  # Append one ad as a semicolon-separated row; newline='' avoids blank rows on Windows
  with open('avito.csv', 'a', newline='', errors='ignore') as f:
    writer = csv.writer(f, delimiter=';')

    writer.writerow((data['title'],
                     data['price'],
                     data['published'],
                     data['url']))



def get_page_data(html):
  soup = BeautifulSoup(html, 'lxml')

  ads = soup.find('div', class_='js-catalog_serp').find_all('div', class_='item_table')
  for ad in ads:

    # keep only the ads whose title contains the keyword
    name = ad.find('div', class_='description').find('h3').text.strip().lower()

    if 'бытовка' in name:


      try:
        title = ad.find('div', class_='description').find('h3').text.strip()
        #print(title)

      except:
        title = ''
        

      try:
        url = 'https://www.avito.ru' + ad.find('div', class_='description').find('h3').find('a').get('href').strip()
        print(url)
                      

      except:
        url = ''

      try:
        price = ad.find('div', class_='about').text.strip()
        print(price)
      
      except:
        price = ''

      try:
        published = ad.find('div', class_='item-date').text.strip()
        print(published)

      except:
        published = ''


      data = {'title': title, 
              'price': price, 
              'published': published,
              'url': url}
      
      
      write_csv(data)
      
    else:
      continue	

def main():
  # Search query "вагончик бытовка" in Irkutsk (URL-encoded in the q= parameter)
  url = "https://www.avito.ru/irkutsk?q=%D0%B2%D0%B0%D0%B3%D0%BE%D0%BD%D1%87%D0%B8%D0%BA+%D0%B1%D1%8B%D1%82%D0%BE%D0%B2%D0%BA%D0%B0&p=1"
  base_url = 'https://www.avito.ru/irkutsk?'
  category_part = 'q=%D0%B2%D0%B0%D0%B3%D0%BE%D0%BD%D1%87%D0%B8%D0%BA+%D0%B1%D1%8B%D1%82%D0%BE%D0%B2%D0%BA%D0%B0&'
  page_part = 'p='

  total_pages = get_total_pages(get_html(url))

  for i in range(1, total_pages + 1):
    url_gen = base_url + category_part + page_part + str(i)
    print(url_gen)
    html = get_html(url_gen)
    get_page_data(html)
    
if __name__ == '__main__':
  main()
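
If the phone number should end up in the same CSV row, I guess write_csv only needs one extra field. A rough sketch (the 'phone' key is my assumption, it is not in the data dict yet):

import csv

# Sketch: the same write_csv as above with an extra "phone" column.
# data['phone'] is hypothetical -- it would be filled in by the OCR step.
def write_csv(data):
  with open('avito.csv', 'a', newline='', errors='ignore') as f:
    writer = csv.writer(f, delimiter=';')
    writer.writerow((data['title'],
                     data['price'],
                     data['published'],
                     data['url'],
                     data.get('phone', '')))  # empty string if the number was not recognized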


The second file:
from selenium import webdriver
from time import sleep
from PIL import Image
from pytesseract import image_to_string

class Bot(object):
  def __init__(self):
    self.driver = webdriver.Firefox()
    self.navigate()

  def take_screenshot(self):
    # Screenshot of the whole ad page; the phone image is cropped out of it later
    self.driver.save_screenshot('avito_screenshot.png')


  def vagon_recon(self):
    # Run tesseract OCR on the cropped phone image and print the recognized text
    image = Image.open('vagon.gif')
    print(image_to_string(image))


  def crop(self, location, size):
    # Cut the phone-number element out of the full-page screenshot
    image = Image.open('avito_screenshot.png')
    x = location['x']
    y = location['y']
    width = size['width']
    height = size['height']

    image.crop((x, y, x + width, y + height)).save('vagon.gif')
    self.vagon_recon()


  def navigate(self):
    self.driver.get('')  # URL of the ad page to open (left empty here)

    # Click the button that reveals the phone number and wait for it to render
    button = self.driver.find_element_by_xpath('//button[@class="button-button-2Fo5k button-size-l-3LVJf button-primary-1RhOG width-width-12-2VZLz"]')
    button.click()

    sleep(3)

    self.take_screenshot()

    # Locate the image with the phone number and crop it out by its coordinates
    image = self.driver.find_element_by_class_name('contacts-phone-3KtSI')
    location = image.location
    size = image.size

    self.crop(location, size)

def main():
  b = Bot()

if __name__ == '__main__':
  main()
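
For the first option, I imagine Bot could be reworked to take an ad URL, run the screenshot/crop/OCR steps and return the recognized string instead of printing it, so that the first file can call it for every ad and put the result into the data dict before write_csv. A rough sketch only, reusing the selectors from the class above (the get_phone method name is made up):

from selenium import webdriver
from time import sleep
from PIL import Image
from pytesseract import image_to_string

class Bot(object):
  def __init__(self):
    # One browser instance reused for all ads
    self.driver = webdriver.Firefox()

  def get_phone(self, url):
    # Open the ad page, reveal the number, crop it out of a screenshot and OCR it
    self.driver.get(url)
    button = self.driver.find_element_by_xpath('//button[@class="button-button-2Fo5k button-size-l-3LVJf button-primary-1RhOG width-width-12-2VZLz"]')
    button.click()
    sleep(3)

    self.driver.save_screenshot('avito_screenshot.png')
    image_el = self.driver.find_element_by_class_name('contacts-phone-3KtSI')
    loc, size = image_el.location, image_el.size
    Image.open('avito_screenshot.png').crop(
      (loc['x'], loc['y'], loc['x'] + size['width'], loc['y'] + size['height'])
    ).save('vagon.gif')
    return image_to_string(Image.open('vagon.gif')).strip()

# In the first file, get_page_data would then do roughly:
#   bot = Bot()                         # created once, e.g. in main()
#   data['phone'] = bot.get_phone(url)
#   write_csv(data)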
