T
T
TANK_IST2019-12-12 21:28:53
JavaScript
TANK_IST, 2019-12-12 21:28:53

How to run parallel page parsing on node.js?

There will be many sites, it is necessary that the pages are loaded in parallel and the result is displayed in one array.
Wrote this code:
index.js

const puppeteer = require('puppeteer');
const yargs = require('yargs');
const sites = [];
sites['amazon'] = require('./amazon.js');
sites['ebay'] = require('./amazon.js');
const data = [];


puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080', '--user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3312.0 Safari/537.36"']
}).then(async browser => {
    for (i in sites) {
        data.push( await sites[i].parse(browser));
    }

    console.log(data);

    await browser.close();
}).catch(function (error) {
    console.error(error);
});

amazon.js
const yargs = require('yargs');

let parse = async (browser) => {
    const page = await browser.newPage();
    await page.setJavaScriptEnabled(false);
    await page.setRequestInterception(true);
    page.on('request', (request) => {
        if (['image', 'stylesheet', 'font', 'script'].indexOf(request.resourceType()) !== -1) {
            request.abort();
        } else {
            request.continue();
        }
    });
    await page.goto("https://www.amazon.com/s?k="+ yargs.argv.search +"&rh=p_n_condition-type%3ANew");
    await page.waitForSelector('body');

    let products = await page.evaluate(() => {
        let divs = document.body.querySelectorAll('.s-search-results>div[data-asin]');
        let items = [];

        divs.forEach(item => {
            let price = item.querySelector('.a-price:not([data-a-strike]) .a-offscreen');
            let priceOld = item.querySelector('.a-price[data-a-strike] .a-offscreen');

            price = price ? price.innerText : null;
            priceOld = priceOld ? priceOld.innerText : null;

            items.push({
                asin: item.getAttribute('data-asin'),
                price: price,
                priceOld: priceOld,
                title: item.querySelector('h2').innerText,
                img: item.querySelector('img').getAttribute('src')
            });
        });

        return items;
    });

    return products;
};

module.exports = {
    parse
};

Now it works, but it is synchronous.
I run the node via php.
Thank you!

Answer the question

In order to leave comments, you need to log in

1 answer(s)
I
IDONTSUDO, 2019-12-12
@IDONTSUDO

- it is necessary that the pages are loaded in parallel and the result is displayed in one array.

This means you want synchronous code. which you can control.
This is a pretty good explanation of how a node works.
But you can also run node cluster . For each page that needs to be parsed, make its own separate stream. And put all the data from the requested pages into the database. And then die it all. But this is such a thing, since performance will depend on how many cores you have in the processor.
If you want to do so. See the documentation for Event: 'message'.

Didn't find what you were looking for?

Ask your question

Ask a Question

731 491 924 answers to any question