How to parse an email using bs4?

I

infirmitive2020-05-18 22:01:50

Python

infirmitive, 2020-05-18 22:01:50

Hello! I want to extract the 2FA code from an email.

from itertools import chain
from bs4 import BeautifulSoup
import time
import imaplib
import email
import email.message

# Credentials are typed as a single "login;password" line. Split on the first
# ';' only, so a password that itself contains ';' is kept intact instead of
# raising "too many values to unpack".
username, password = input('Введи логин;пароль: ').split(';', 1)

# Search keys and values used to pick out the 2FA notification emails; they
# are turned into a search expression by search_string().
criteria = {
    'FROM':    '[email protected]',
    'SUBJECT': 'Ваш код двухфакторной аутентификации',
}
# Highest message UID handled so far; 0 means nothing has been seen yet.
uid_max = 0


def search_string(uid_max, criteria):
    """Build the parenthesised search expression sent to the mail server.

    Every ``criteria`` item contributes its key followed by its value wrapped
    in double quotes. A final ``UID <uid_max + 1>:*`` term is appended so the
    search only asks for messages above ``uid_max``.

    Args:
        uid_max: Highest UID already processed (an integer).
        criteria: Mapping of search key (str) to the value to match.

    Returns:
        The expression as a single string, e.g.
        ``(FROM "a@b.c" UID 1:*)``.
    """
    tokens = []
    for key, value in criteria.items():
        tokens.append(key)
        tokens.append('"' + str(value) + '"')
    tokens.append('UID')
    tokens.append('%d:*' % (uid_max + 1))
    return '(%s)' % ' '.join(tokens)


def get_first_text_block(msg):
    """Return the decoded body of the first ``text/*`` part of *msg*.

    The message tree is walked depth-first, so a text part nested inside
    another multipart (e.g. ``multipart/alternative`` within
    ``multipart/mixed``) is found as well. The Content-Transfer-Encoding
    (base64, quoted-printable) is undone and the bytes are decoded with the
    part's declared charset, so the caller receives real text/HTML rather
    than the encoded wire form.

    Args:
        msg: An ``email.message.Message`` instance.

    Returns:
        The text of the first ``text/*`` part as ``str``, or ``None`` when the
        message contains no such part.
    """
    # walk() yields msg itself first, so a plain non-multipart text message is
    # handled by the same loop.
    for part in msg.walk():
        if part.get_content_maintype() != 'text':
            continue

        raw = part.get_payload(decode=True)
        if raw is None:
            # Nothing to decode; fall back to the payload as stored.
            return part.get_payload()

        charset = part.get_content_charset() or 'utf-8'
        try:
            return raw.decode(charset, errors='replace')
        except LookupError:
            # Charset name unknown to Python: best-effort UTF-8 decode.
            return raw.decode('utf-8', errors='replace')

    return None


IMAP_HOST = 'mail.example.com'

# Inline style of the <div> that wraps the code in the email body. The value
# must match the attribute text exactly. The earlier lookup used
# "letter-spacing: 15x" (missing "p"), so find() returned None.
CODE_DIV_STYLE = (
    'font-family: arial,helvetica,sans-serif; '
    'mso-line-height-rule: exactly; color: #313131; text-align: center; '
    'font-size: 40px; letter-spacing: 15px; line-height: 100px'
)


def _open_inbox():
    """Log in to the IMAP server, select INBOX and return the connection."""
    conn = imaplib.IMAP4_SSL(IMAP_HOST)
    conn.login(username, password)
    conn.select('INBOX')
    return conn


def _search_uids(conn, after_uid):
    """Return the UIDs (as ints) reported by a UID SEARCH for ``criteria``."""
    result, data = conn.uid(
        'search', None, search_string(after_uid, criteria).encode('utf-8'))
    if not data or not data[0]:
        return []
    return [int(s) for s in data[0].split()]


# Initial pass: remember the newest matching UID so that only emails arriving
# after start-up are processed by the loop below.
server = _open_inbox()
try:
    print('Успешно подключились к ' + username)
    uids = _search_uids(server, uid_max)
finally:
    server.logout()
if uids:
    uid_max = max(uids)

while True:
    server = _open_inbox()
    try:
        for uid in _search_uids(server, uid_max):
            # The search result can still contain a UID that is not newer
            # than uid_max (an "N:*" range may include the last message), so
            # skip anything already handled.
            if uid <= uid_max:
                continue

            result, data = server.uid('fetch', str(uid), '(RFC822)')
            uid_max = uid
            if not data or data[0] is None:
                # Nothing was returned for this UID; move on.
                continue

            msg = email.message_from_bytes(data[0][1])
            text = get_first_text_block(msg)

            code = None
            if text:
                soup = BeautifulSoup(text, 'lxml')
                code = soup.find('div', style=CODE_DIV_STYLE)

            if code is None:
                # Report the miss instead of crashing on code.text.
                print('Код не найден в письме (UID %d)' % uid)
                continue
            print(code.text.strip())
    finally:
        # Always close the session, even if fetching or parsing failed.
        server.logout()

    # Pause between polls. This has to be inside the loop, otherwise the
    # script reconnects back-to-back without any delay.
    time.sleep(1)

When an email arrives, BeautifulSoup does not seem to parse the HTML (or I wrote something wrong). Printing the soup
variable successfully displays the HTML body of the email, but the div cannot be found by the specified style, so its text cannot be displayed: print(code.text) fails because code is None (NoneType).
The HTML itself looks like this:

<div style="font-family: arial,helvetica,sans-serif; mso-line-height-rule: exactly; color: #313131; text-align: center; font-size: 40px; letter-spacing: 15px; line-height: 100px">
366634
</div>

Reply

Answer the question

In order to leave comments, you need to log in

2 answer(s)

S

Sergey Karbivnichy, 2020-05-18
@infirmitive

You need to search by attribute:

# `soup` is the BeautifulSoup object built in the question's script. Note that
# the style value here reads "letter-spacing: 15px", whereas the question's
# lookup had "letter-spacing: 15x"; the searched value has to match the
# attribute text in the email.
code = soup.find('div',attrs={'style':'font-family: arial,helvetica,sans-serif; mso-line-height-rule: exactly; color: #313131; text-align: center; font-size: 40px; letter-spacing: 15px; line-height: 100px'}).text.strip()
  print(code)
  >>> 366634

T

TheAngryPython, 2020-05-18
@TheAngryPython

Using Regular Expressions

import re
# Raw string, so that \d reaches the regex engine unchanged instead of being
# treated as an (invalid) string escape. The pattern captures a run of exactly
# six digits that has '<', '>' or a newline on both sides, where the character
# before the opening delimiter is not '#'.
pattern = re.compile(r'[^#][<>\n](\d{6})[<>\n]')
pattern.findall('<div style="font-family: arial,helvetica,sans-serif; mso-line-height-rule: exactly; color: #313131; text-align: center; font-size: 40px; letter-spacing: 15px; line-height: 100px">366634</div>')
['366634']