Who will help to deal with the parser ???

K

KolenBG2022-04-17 15:30:05

Python

KolenBG, 2022-04-17 15:30:05

import requests
from bs4 import BeautifulSoup

URL = ' https://auto.drom.ru/region25/audi/ '

HEADERS = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ( KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 OPR/85.0.4341.60',
'accept': '*/*',
}

HOST = ' https://auto.drom.ru '

def get_html(url, params =None):
r = requests.get(url, headers=HEADERS, params=params)
return r

def get_content(html):
soup = BeautifulSoup(html, 'html.parser')
items = soup.find_all('a', {'class': "css-1ctbluq ewrty961"})

cars = []
for item in items:
cars.append({
'title': item.find('span', {'data-ftid': 'bull_title'}).get_text(),
'objec': item.find('div', {'class' : 'css-188tlrp e162wx9x0'}).get_text(),

'price': item.find('span', {'data-ftid': 'bull_price'}).text
})

print(cars)
print(len( cars))

def parce():
html = get_html(URL)
if html.status_code == 200:
get_content(html.text)
else:
print('ERROR')

parce()

[{'title': 'Audi Q7, 2017 ', 'objec': '2.0 l (252 hp), gasoline, 4WD, 67 thousand km ', 'price': '4\xa0000\xa0000\xa0'},

The essence of the question is how to get rid of \xa ???
I've tried different methods and it doesn't work at all.

Reply

Answer the question

In order to leave comments, you need to log in

1 answer(s)

S

Sergey Gornostaev, 2022-04-17
@sergey-gornostaev

item.find('span', {'data-ftid': 'bull_price'}).text.replace('\xa0', '')

But it is desirable to add a check that the BS found the required tag and returned the string.