Answer the question
In order to leave comments, you need to log in
How to pack characteristics by columns in CSV using Scrapy?
Can you please tell me how to package characteristics by columns in CSV in Scrapy?
All characteristics are collected in one multiple field.
def parse(self, response):
    """Yield one item per product page, with every characteristic as its own field.

    The item always carries ``name`` (list of ``<h1>`` text nodes), ``har``
    (raw HTML of every attribute row, kept for backward compatibility) and
    ``url``. In addition, each WooCommerce attribute row contributes one key:
    the ``<th>`` text is the key and the ``<td>`` text is the value, so a CSV
    export gets one column per characteristic.

    NOTE(review): the CSV exporter is believed to take its column set from the
    first item unless ``FEED_EXPORT_FIELDS`` is configured; products with
    differing attribute sets likely need that setting - confirm against the
    feed-export docs.
    """
    rows = response.xpath('//tr[contains(@class,"woocommerce-product-attributes-item")]')
    item = {
        'name': response.xpath('//h1/text()').extract(),
        'har': rows.getall(),
        'url': response.url,
    }
    for row in rows:
        # normalize-space() flattens nested <p>/<a> markup and trims whitespace.
        label = row.xpath('normalize-space(./th)').get()
        if not label:
            continue
        value = row.xpath('normalize-space(./td)').get()
        # setdefault: never let an attribute label clobber name/har/url,
        # and keep the first value if a label repeats.
        item.setdefault(label, value)
    yield item
<table class="woocommerce-product-attributes shop_attributes">
<tbody><tr class="woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_strana-proizvodstva">
<th class="woocommerce-product-attributes-item__label">Страна производства</th>
<td class="woocommerce-product-attributes-item__value"><p>Канада</p>
</td>
</tr>
<tr class="woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_obyom">
<th class="woocommerce-product-attributes-item__label">Объём</th>
<td class="woocommerce-product-attributes-item__value"><p><a href="https://ongip.ru/product-attribute/obyom/240-ml/" rel="tag">240 мл</a></p>
</td>
</tr>
<tr class="woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_%d0%bf%d0%b8%d1%89%d0%b5%d0%b2%d0%b0%d1%8f-%d1%86%d0%b5%d0%bd%d0%bd%d0%be%d1%81%d1%82%d1%8c">
<th class="woocommerce-product-attributes-item__label">Пищевая ценность</th>
<td class="woocommerce-product-attributes-item__value"><p>Attitude Канада</p>
</td>
</tr>
<tr class="woocommerce-product-attributes-item woocommerce-product-attributes-item--attribute_pa_seriya">
<th class="woocommerce-product-attributes-item__label">Серия</th>
<td class="woocommerce-product-attributes-item__value"><p><a href="https://ongip.ru/product-attribute/seriya/furry-friends/" rel="tag">Furry Friends</a></p>
</td>
</tr>
</tbody></table>
Answer the question
In order to leave comments, you need to log in
That's not how it is usually done: the data is first stored in a database (usually NoSQL) and exported to CSV from there.
The data may not be consistent, so writing it straight to a file is unwise,
but if you really want
scrapingauthority.com/2016/09/19/scrapy-exporting-...
https://stackoverflow.com/questions/20719263/write...
So you need a script that will check for consistency.
I have a similar script, but it's in C#
using System;
using System.Collections.Generic;
using System.Text;
using System.Data;
using DynamicPluginData;
using System.Text.RegularExpressions;
using System.Reflection;
using System.Collections.Specialized;
using System.IO;
using LowLevel;
using System.Net;
using System.Threading;
using System.Collections;
using HtmlAgilityPack;
namespace DatacolDynamicPluginNS
{
    /// <summary>
    /// Pre-export plugin that splits an HTML block of characteristics, stored in a
    /// single data field, into one table column per characteristic.
    /// </summary>
    public class DynamicPluginClass
    {
        /// <summary>
        /// Expands the characteristics field of every row into separate columns.
        /// For each row the field is cut into areas (one per characteristic); the name
        /// found in an area becomes a column (added on first sight) and the value found
        /// in the same area is written into that column for the row.
        /// </summary>
        /// <param name="dataTable">Table to process; it is modified in place and returned.</param>
        /// <param name="itemInfo">Not used by this method.</param>
        /// <param name="globalInfo">Not used by this method.</param>
        /// <returns>The same <paramref name="dataTable"/> instance with the added columns.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="dataTable"/> is null.</exception>
        /// <exception cref="Exception">
        /// No characteristic areas, no characteristic names, or no characteristic values
        /// were found in any row.
        /// </exception>
        public static DataTable preExportData(DataTable dataTable, ItemInfo itemInfo, GlobalInfo globalInfo)
        {
            if (dataTable == null) throw new ArgumentNullException("dataTable");

            // Name of the data field that holds the collected characteristics table
            string FieldName = "Характеристики";

            // XPath expressions selecting the area that contains the name and value of one characteristic
            List<string> ParameterAreaXpathList = new List<string> { "//li" };
            // XPath expressions selecting the characteristic name inside a cut-out area
            List<string> ParameterNameXpathList = new List<string> { "//div[1]" };
            // XPath expressions selecting the characteristic value inside a cut-out area
            List<string> ParameterValueXpathList = new List<string> { "//div[2]" };

            // Regular expressions selecting the area that contains the name and value of one characteristic
            List<string> ParameterAreaRegexList = new List<string>();  // e.g. {"<dl[^<>]*?product-spec[^<>]*?>.*?</dl>"}
            // Regular expressions selecting the characteristic name inside a cut-out area
            List<string> ParameterNameRegexList = new List<string>();  // e.g. {"<span[^<>]*?product-spec__name-inner[^<>]*?>(.*?)<.*?>"}
            // Regular expressions selecting the characteristic value inside a cut-out area
            List<string> ParameterValueRegexList = new List<string>(); // e.g. {"<span[^<>]*?product-spec__value-inner[^<>]*?>(.*?)<.*?>"}

            bool areasFound = false;
            bool titlesFound = false;
            bool valuesFound = false;

            // Single pass: each row's HTML is parsed once; columns are created on first
            // sight of a name and the value is written right away. Column order is the
            // order in which names are first encountered.
            for (int i = 0; i < dataTable.Rows.Count; i++)
            {
                List<string> areas = getMatches(dataTable.Rows[i][FieldName].ToString(), ParameterAreaXpathList, ParameterAreaRegexList);
                if (areas.Count == 0) continue;
                areasFound = true;

                foreach (string area in areas)
                {
                    // Regex group 1 is the captured text; tags are stripped from the result.
                    string title = getMatch(area, ParameterNameXpathList, ParameterNameRegexList, 1, true);
                    if (String.IsNullOrEmpty(title)) continue;
                    titlesFound = true;

                    // A column is created even when the value turns out to be empty.
                    if (!dataTable.Columns.Contains(title))
                    {
                        dataTable.Columns.Add(title);
                    }

                    string value = getMatch(area, ParameterValueXpathList, ParameterValueRegexList, 1, true);
                    if (String.IsNullOrEmpty(value)) continue;
                    valuesFound = true;

                    dataTable.Rows[i][title] = value;
                }
            }

            // Checks are reported in the same order as the extraction stages.
            if (!areasFound) throw new Exception("Области с характеристиками не найдены");
            if (!titlesFound) throw new Exception("Названия характеристик не найдены");
            if (!valuesFound) throw new Exception("Значения характеристик не найдены");

            return dataTable;
        }

        /// <summary>
        /// Collects every fragment of <paramref name="RawParameterSource"/> matched by the
        /// given XPath expressions (outer HTML of each node) followed by every match of the
        /// given regular expressions (the requested capture group).
        /// </summary>
        /// <param name="RawParameterSource">HTML text to search; null yields an empty list.</param>
        /// <param name="ParameterAreaXpathList">XPath expressions to evaluate; may be null or empty.</param>
        /// <param name="ParameterAreaRegexList">
        /// Regular expressions to evaluate (Singleline, IgnoreCase); may be null or empty.
        /// </param>
        /// <param name="RegexGroup">Capture group taken from each regex match; not used for XPath results.</param>
        /// <param name="StripTags">When true, tags are stripped from every result and it is trimmed.</param>
        /// <returns>The matches in discovery order; never null.</returns>
        public static List<string> getMatches(string RawParameterSource, List<string> ParameterAreaXpathList, List<string> ParameterAreaRegexList, int RegexGroup = 0, bool StripTags = false)
        {
            List<string> RetVal = new List<string>();
            if (RawParameterSource == null) return RetVal;

            // XPath search; the document is only parsed when there is something to evaluate.
            if (ParameterAreaXpathList != null && ParameterAreaXpathList.Count > 0)
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(RawParameterSource);

                foreach (string xpath in ParameterAreaXpathList)
                {
                    // SelectNodes yields null (not an empty collection) when nothing matches.
                    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(xpath);
                    if (nodes == null) continue;

                    foreach (HtmlNode node in nodes)
                    {
                        RetVal.Add(node.OuterHtml);
                    }
                }
            }

            // Regex search
            if (ParameterAreaRegexList != null)
            {
                foreach (string regex in ParameterAreaRegexList)
                {
                    MatchCollection matches = Regex.Matches(RawParameterSource, regex, RegexOptions.Singleline | RegexOptions.IgnoreCase);
                    foreach (Match match in matches)
                    {
                        RetVal.Add(match.Groups[RegexGroup].ToString());
                    }
                }
            }

            if (StripTags)
            {
                for (int i = 0; i < RetVal.Count; i++)
                {
                    RetVal[i] = extra.strip_tags(RetVal[i]).Trim();
                }
            }

            return RetVal;
        }

        /// <summary>
        /// Returns the first result of <see cref="getMatches"/> for the same arguments,
        /// or an empty string when there is none.
        /// </summary>
        /// <param name="RawParameterSource">HTML text to search.</param>
        /// <param name="ParameterAreaXpathList">XPath expressions to evaluate.</param>
        /// <param name="ParameterAreaRegexList">Regular expressions to evaluate.</param>
        /// <param name="RegexGroup">Capture group taken from each regex match.</param>
        /// <param name="StripTags">When true, tags are stripped from the result and it is trimmed.</param>
        /// <returns>The first match, or "" if nothing matched.</returns>
        public static string getMatch(string RawParameterSource, List<string> ParameterAreaXpathList, List<string> ParameterAreaRegexList, int RegexGroup = 0, bool StripTags = false)
        {
            List<string> RetValList = getMatches(RawParameterSource, ParameterAreaXpathList, ParameterAreaRegexList,
                RegexGroup,
                StripTags);
            return RetValList.Count == 0 ? "" : RetValList[0];
        }
    }
}
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question