Answer the question
In order to leave comments, you need to log in
Why does the parser work so weird when parallelized?
The situation is this.
I am writing a parser that I set on the site decathlon.ru (specifically, on the "Sports" section). The essence of the parser is simple - we bypass addresses from a given list (these are product catalogs), read in the catalogs what sport they belong to, what category and subcategory of products are presented, take links to the products themselves and parse the page of each product, pulling out its name and price from there .
I do the parsing myself using the AngleSharp library.
Here is the code of the method, where a separate product catalog is bypassed:
private static readonly IConfiguration _config = Configuration.Default.WithDefaultLoader();
static async Task<IList<Product>> ParseProductsCatalogPageAsync(string catalogUrl, string filenameOutput,
bool isAppend)
{
var products = new List<Product>();
var document = await BrowsingContext.New(_config).OpenAsync(catalogUrl);
string selBreadcrumbText = ".m-breadcrumbs ul[data-breadcrumb-size='5'] li a span.text";
var breadcrumbElements = document.QuerySelectorAll(selBreadcrumbText).ToArray();
string meansOfSport = breadcrumbElements[0].TextContent;
string productsCategory = breadcrumbElements[1].TextContent;
string selProductsSubcategory = "span.link_breadcrumb.link_breadcrumb_no_child_last";
var elemProductsSubcategory = document.QuerySelector(selProductsSubcategory);
string productsSubcategory = elemProductsSubcategory.TextContent.Trim();
int qProducts = Convert.ToInt32(document.QuerySelector("span.nb-products").TextContent);
if (qProducts > 0)
{
int numPage = 1;
int id = 1;
string rootLink = "https://www.decathlon.ru";
if (qProducts > 40)
{
while (true)
{
document =
await BrowsingContext.New(_config)
.OpenAsync(catalogUrl + "/I-Page" + numPage.ToString() + "_40");
string selProductLink = "a.thumbnail-link";
var productElements = document.QuerySelectorAll(selProductLink).AsEnumerable();
var currDocument = document;
Parallel.ForEach(productElements, (elem) =>
{
string link = rootLink + elem.GetAttribute("href");
var tskProduct = ParseProductPageAsync(link);
var product = tskProduct.Result;
product.Id = id;
product.SportsName = meansOfSport;
product.Category = productsCategory;
product.Subcategory = productsSubcategory;
id++;
if (Math.Abs(product.Price) < 0.001)
{
string selProductPrice = selProductLink + "[href='" + elem.GetAttribute("href") +
"'] " +
"div.zone-price-selling-price div.price";
string trimmedMinPriceString = currDocument.QuerySelector(selProductPrice)
.TextContent
.Trim()
.Replace(" ", "").Replace(" ", "");
string minPriceString = trimmedMinPriceString.Substring(0,
trimmedMinPriceString.Length - "руб.".Length).Replace(" ", "");
product.Price = Convert.ToDouble(minPriceString);
}
products.Add(product);
});
if (products.Count == qProducts)
{
break;
}
numPage++;
}
}
else
{
string selProductLink = "a.thumbnail-link";
var productElements = document.QuerySelectorAll(selProductLink).AsEnumerable();
var currDocument = document;
Parallel.ForEach(productElements, (elem) =>
{
string link = rootLink + elem.GetAttribute("href");
try
{
var tskProduct = ParseProductPageAsync(link);
var product = tskProduct.Result;
product.Id = id;
product.SportsName = meansOfSport;
product.Category = productsCategory;
product.Subcategory = productsSubcategory;
id++;
if (Math.Abs(product.Price) < 0.001)
{
string selProductPrice = selProductLink + "[href='" + elem.GetAttribute("href") +
"'] " +
"div.zone-price-selling-price div.price";
string trimmedMinPriceString = currDocument.QuerySelector(selProductPrice)
.TextContent
.Trim()
.Replace(" ", "").Replace(" ", "");
string minPriceString = trimmedMinPriceString.Substring(0,
trimmedMinPriceString.Length - "руб.".Length).Replace(" ", "");
product.Price = Convert.ToDouble(minPriceString);
}
products.Add(product);
}
catch (Exception)
{
Debug.WriteLine(link);
}
});
}
}
SaveProductsDataToFile(filenameOutput, products, isAppend);
Console.WriteLine("Обработана категория {0} > {1} > {2}", meansOfSport, productsCategory,
productsSubcategory);
return products;
}
static async Task<Product> ParseProductPageAsync(string productUrl)
{
var document = await BrowsingContext.New(_config).OpenAsync(productUrl);
string selArticle = "div.ref-product";
string selProductName = "span#productName";
string selPrice = "span#real_price_value";
var elemArticle = document.QuerySelector(selArticle);
var elemProductName = document.QuerySelector(selProductName);
var elemPrice = document.QuerySelector(selPrice);
string fullArticleString = elemArticle.TextContent.Replace("\n", "").Replace("\t", "");
string articleStringName = "Артикул : ";
string fullPriceString = elemPrice == null ? "0руб." : elemPrice.TextContent;
string priceOnlyString = fullPriceString.Substring(0, fullPriceString.Length - "руб.".Length);
long article = Convert.ToInt64(fullArticleString.Substring(articleStringName.Length));
string productName = elemProductName.TextContent;
double price = Convert.ToDouble(priceOnlyString);
var product = new Product
{
Article = article,
Name = productName,
Price = price
};
return product;
}
Answer the question
In order to leave comments, you need to log in
List is not thread-safe collection, try to use ConcurrentBag instead
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question