A
A
Alexey Yarkov2016-01-09 22:36:53
Python
Alexey Yarkov, 2016-01-09 22:36:53

How to execute an action after yield?

# -*- coding: utf-8

import MySQLdb
from grab.spider import Spider, Task
import logging
import time
import re
import urllib2

def _ru(s="", e='utf-8'):
  try:
    s = s.encode(e)
  except:
    return s
  return s

class BashImSpider(Spider):

  host = "localhost"
  user = "root"
  passwd = ""
  base = "bashorg"
  charset = 'utf8'
  initurl = 'http://bash.im/'
  user_agent = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0"
  )
  currentPage = 1

  root = '//div[@class="quote"]'
  id = './/a[@class="id"]'
  cite = './/div[@class="text"]'

  def __init__(self):
    super(self.__class__, self).__init__()
    self.db = MySQLdb.connect(
                  host=self.host, 
                  user=self.user, 
                  passwd=self.passwd, 
                  db=self.base, 
                  charset=self.charset
                )
    self.cursor = self.db.cursor()
    self.maxPage = self.getMaxPage()

  def task_generator(self):
    """Проходим в цикле по всем страницам и генерируем задачи для парсера Spider"""
    # здесь цикл по количеству страниц, а после цикла закрыть соединение с БД
    while self.currentPage < self.maxPage:
      url = self.getUrl(self.currentPage)
      self.currentPage = self.currentPage + 1
      yield Task('search', url=url)

    # А ТУТ НАДО КАК-ТО ЗАКРЫТЬ КОННЕКТ К БД, НО ЭТО ДЕЙСТВИЕ НЕ ВЫПОЛНЯЕТСЯ!!!
    self.db.close()

  def getUrl(self, page):
    """Вернет URL страницы в формате http://bash.im/index/%номер страницы%"""
    return  "%sindex/%s" % (self.initurl, str(page))

  def getMaxPage(self):
    """Вернет число страниц для построения цикла"""
    req = urllib2.Request(
      url=self.initurl,
      headers={"User-Agent": self.user_agent}
    )
    page = urllib2.urlopen(req)
    data = page.read()
    result = re.search(r'max=\"([0-9]+)\"', _ru(data.decode('windows-1251')))
    return int(result.group(1))

  def writeCitate(self, sqlData = ()):
    """Заносим цитаты в БД"""
    sql = "INSERT INTO bash (quote_id, quote_text) VALUES ({quote_id}, '{quote_text}')".format(
                                                    quote_id = str(sqlData[0]), 
                                                    quote_text = str(sqlData[1])
                                                  )
    self.cursor.execute(sql)
    self.db.commit()

  def task_search(self, grab, task):
    """Собираем все цитаты со страницы"""
    for elem in grab.doc.select(self.root):
      time.sleep(1)
      try:
        id = elem.select(self.id)
        cite = elem.select(self.cite)
        sqlData = (
            id.number(),
            _ru(cite.html()
              .replace( '<div class="text">', '' )
              .replace( '</div>', '' ))
            )
        self.writeCitate(sqlData)
      except:
        continue


logging.basicConfig(level=logging.DEBUG)
bashParser = BashImSpider()
bashParser.run()

How to make the self.db.close() method still called?

Answer the question

In order to leave comments, you need to log in

1 answer(s)
A
abs0lut, 2016-01-09
@yarkov

Use the with context manager

Didn't find what you were looking for?

Ask your question

Ask a Question

731 491 924 answers to any question