Answer the question
In order to leave comments, you need to log in
When collecting data from hh.ru, not all links are printed?
Good day!
I am writing an hh.ru parser for the vacancy section in Russia. There are about 50 ads on one page, but when I collect all the links from the page and print them, only 20 of the 50 links are displayed. I have tried searching for the links in different ways, using both the find() and find_all() methods.
Can you please tell me why this happens and whether I am doing something wrong? Please don't judge too harshly; I have only been learning Python for two weeks. Here is the script:
import requests
from bs4 import BeautifulSoup as bs
import fake_useragent
# Request headers shared by every HTTP call in this script: a browser-like
# Accept value and a random User-Agent string supplied by fake_useragent.
headers = {
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'User-Agent': fake_useragent.UserAgent().random,
}
def get_data(url, max_pages=1):
    """Print the vacancy links found on hh.ru search result pages.

    The first request (to ``url``) is used only to read the number of result
    pages from the pager widget. The result pages themselves are then fetched
    from a fixed search URL with the page number substituted in.

    Args:
        url: Search URL used to discover the page count.
        max_pages: Upper bound on the number of result pages to fetch.
            Defaults to 1 (only the first page, as before). Pass ``None``
            to fetch every page reported by the pager.

    Returns:
        None. Links are printed to stdout. Returns early, printing nothing,
        when the pager cannot be located or parsed.
    """
    session = requests.Session()
    response = session.get(url, headers=headers)
    soup = bs(response.text, 'lxml')

    try:
        # The last direct <span> child of the pager holds the last page number.
        pager = soup.find('div', class_='pager')
        last_button = pager.find_all('span', recursive=False)[-1]
        page_count = int(last_button.find('a').find('span').text)
    except (AttributeError, IndexError, ValueError):
        # find() returned None, the pager had no spans, or the text was not
        # a number: there is nothing to paginate over.
        return

    pages_to_fetch = page_count if max_pages is None else min(page_count, max_pages)

    for page in range(pages_to_fetch):
        # f-string: without the prefix the literal text "{page}" was sent
        # to the server instead of the page number.
        response = session.get(
            url=f'https://hh.ru/search/vacancy?clusters=true&ored_clusters=true&enable_snippets=true&salary=&text=%D0%A1%D0%B8%D1%81%D1%82%D0%B5%D0%BC%D0%BD%D1%8B%D0%B9+%D0%B0%D0%B4%D0%BC%D0%B8%D0%BD%D0%B8%D1%81%D1%82%D1%80%D0%B0%D1%82%D0%BE%D1%80&page={page}&customDomain=1',
            headers=headers,
        )
        soup = bs(response.text, 'lxml')
        # NOTE(review): requests does not execute JavaScript, so only anchors
        # present in the served HTML are found here; vacancies rendered
        # client-side may be missing - confirm against the page source.
        links = soup.find_all(
            'a',
            class_='bloko-link',
            attrs={'data-qa': 'vacancy-serp__vacancy-title', 'target': '_blank'},
        )
        for a in links:
            href = a.get('href')
            if not href:
                # An anchor without href has no link to print.
                continue
            # Drop the query string so only the bare vacancy URL is printed.
            print(href.split('?')[0])
def main():
    """Entry point: run get_data on the hh.ru vacancy search URL (page=0)."""
    search_url = 'https://hh.ru/search/vacancy?clusters=true&ored_clusters=true&enable_snippets=true&salary=&text=%D0%A1%D0%B8%D1%81%D1%82%D0%B5%D0%BC%D0%BD%D1%8B%D0%B9+%D0%B0%D0%B4%D0%BC%D0%B8%D0%BD%D0%B8%D1%81%D1%82%D1%80%D0%B0%D1%82%D0%BE%D1%80&page=0&customDomain=1'
    get_data(search_url)


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
Answer the question
In order to leave comments, you need to log in
Initially, all vacancies are delivered inside a template tag, and JavaScript then builds them into the body of the page. requests doesn't execute JS, so the data stays in that tag. In the page source, find the tag with the id HH-Lux-InitialState; it contains ordinary JSON.
So: locate that tag with BeautifulSoup, load its text as JSON, and look up what you need:
# Fragment meant to replace the page loop in the question's script; it relies
# on `session`, `headers`, `bs` and `json` being defined/imported already.
for page in range(0, 1):
    # f-string: without the prefix the literal text "{page}" would be sent
    # instead of the page number.
    response = session.get(url=f'https://hh.ru/search/vacancy?clusters=true&ored_clusters=true&enable_snippets=true&salary=&text=%D0%A1%D0%B8%D1%81%D1%82%D0%B5%D0%BC%D0%BD%D1%8B%D0%B9+%D0%B0%D0%B4%D0%BC%D0%B8%D0%BD%D0%B8%D1%81%D1%82%D1%80%D0%B0%D1%82%D0%BE%D1%80&page={page}&customDomain=1', headers=headers)
    soup = bs(response.text, 'html.parser')
    # The page's initial state is embedded as JSON text in this <template> tag.
    state_tag = soup.find('template', attrs={'id': 'HH-Lux-InitialState'})
    # json.loads parses a string; json.load expects a file-like object and
    # would fail on the tag's text.
    data = json.loads(state_tag.text)
    for vacancy in data['vacancySearchResult']['vacancies']:
        print(vacancy['name'])
{
"@responseLetterRequired": false,
"@showContact": false,
"vacancyId": 54578729,
"name": "Системный администратор в Информационный центр по атомной энергии Челябинска",
"company": {
"@showSimilarVacancies": true,
"@trusted": true,
"@category": "COMPANY",
"@countryId": 1,
"@state": "APPROVED",
"id": 1339904,
"name": "Информационный центр атомной отрасли",
"visibleName": "Информационный центр атомной отрасли"
},
"compensation": {
"from": 40000,
"currencyCode": "RUR",
"gross": false
},
"publicationTime": {
"@timestamp": 1649656472,
"$": "2022-04-11T08:54:32.258+03:00"
},
"type": "open",
"area": {
"@id": 104,
"name": "Челябинск",
"path": ".113.223.1384.104."
},
"acceptTemporary": false,
"address": {
"@id": 7055117,
"@disabled": false,
"city": "Челябинск",
"street": "улица Энгельса",
"building": "107",
"displayName": "Челябинск, улица Энгельса, 107",
"mapData": "{\"points\":{\"center\":{\"lat\":55.147782999978,\"lng\":61.381102000000034,\"zoom\":21},\"marker\":{\"lat\":55.147783,\"lng\":61.381102}}}",
"marker": {
"@lat": 55.147783,
"@lng": 61.381102
},
"manager": {
"@id": 7315715
}
},
"metallic": "standard_plus",
"creationSite": "voronezh.hh.ru",
"creationSiteId": 15,
"displayHost": "hh.ru",
"lastChangeTime": {
"@timestamp": 1649659594,
"$": "2022-04-11T09:46:34.624+03:00"
},
"creationTime": "2022-04-05T08:54:32.258+03:00",
"canBeShared": true,
"employerManager": {
"@hhid": 113655137,
"@managerId": 7943574,
"@userId": 91458223,
"latestActivity": "online"
},
"inboxPossibility": true,
"chatWritePossibility": "ENABLED_AFTER_INVITATION",
"notify": true,
"links": {
"desktop": "https://hh.ru/vacancy/54578729",
"mobile": "https://m.hh.ru/vacancy/54578729"
},
"workSchedule": "FULL_DAY",
"acceptIncompleteResumes": false,
"driverLicenseTypes": [
{}
],
"languages": [
{}
],
"workingDays": [
{}
],
"workingTimeIntervals": [
{}
],
"workingTimeModes": [
{}
],
"hrBotScenario": {
"hrBotScenarioId": 1552425,
"enabled": false,
"scenarioCode": "vacancy_scenario_1552425"
},
"vacancyProperties": {
"properties": [
{}
]
},
"userLabels": [],
"snippet": {
"req": "Высшее или среднее образование (IT). Знание и любовь к «железу». Стрессоустойчивость и коммуникабельность (работа подразумевает частый контакт с людьми разных...",
"resp": "Обслуживание компьютерной техники в офисе компании (компьютеры, роутеры, принтеры и т.д.). Работа с мультимедийной системой (показ фильмов, презентаций). ",
"cond": "Полный рабочий день (возможна работа в выходные и вечерние часы, предоставляются отгулы). Официальное трудоустройство. Заработная плата 40 тысяч на руки. ",
"skill": "Подготовка презентаций, Техническое обслуживание, Работа в команде",
"desc": null
},
"responsesCount": 29
}
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question