Answer the question
In order to leave comments, you need to log in
Where can I find the script (parser/grabber) of rss feeds?
Hi, could you recommend a script or an engine that can parse a given list of sites and grab their latest news — or rather, just the headlines and the links to them? Something like Google, but simpler :).
If this has to be written from scratch, please point me to tutorials or advise how to implement it (I Googled but didn't find anything suitable). Thanks!
Answer the question
In order to leave comments, you need to log in
There is one homemade tool; its description can be viewed here
The site running on it is here
If you are interested, write to me; if need be, we can extend it for your specific needs, or set it up and walk you through it
Shit code from an old project
<?php
require_once('parserutils.class.php');
/**
 * Iterator over the <item> elements of a parsed RSS document.
 *
 * Every step yields the array produced by ParserUtils::constructRssItem()
 * from the item's title, link, description and pubDate.
 *
 * The #[\ReturnTypeWillChange] markers silence the PHP 8.1+ deprecation about
 * missing Iterator return types; older PHP versions read them as comments.
 */
class RSSParser implements Iterator {
    /** @var int Zero-based index of the item the iterator points at. */
    private $position = 0;

    /** @var array List of <item> nodes, indexed from 0. */
    private $rss = [];

    /**
     * @param mixed $rss Parsed feed exposing channel->item, e.g. the
     *                   SimpleXMLElement returned by simplexml_load_string().
     *                   Anything else results in an empty iterator.
     */
    public function __construct($rss) {
        $this->position = 0;
        if (!is_object($rss) || !isset($rss->channel->item)) {
            return;
        }
        // Collect the nodes one by one instead of casting the channel to an
        // array: the cast yields a bare object (not a list) when the feed has
        // exactly one <item>, which used to make such feeds look empty.
        foreach ($rss->channel->item as $item) {
            $this->rss[] = $item;
        }
    }

    /** Moves the iterator back to the first item. */
    #[\ReturnTypeWillChange]
    public function rewind() {
        $this->position = 0;
    }

    /**
     * @return array|null The current item as built by
     *                    ParserUtils::constructRssItem(), or null when the
     *                    iterator is past the last item.
     */
    #[\ReturnTypeWillChange]
    public function current() {
        if (!$this->valid()) {
            return null;
        }
        $c = $this->rss[$this->position];
        return ParserUtils::constructRssItem(
            $c->title,
            $c->link,
            $c->description,
            $c->pubDate
        );
    }

    /** @return int Zero-based index of the current item. */
    #[\ReturnTypeWillChange]
    public function key() {
        return $this->position;
    }

    /** Advances to the next item. */
    #[\ReturnTypeWillChange]
    public function next() {
        ++$this->position;
    }

    /** @return bool True while the iterator points at an existing item. */
    #[\ReturnTypeWillChange]
    public function valid() {
        return isset($this->rss[$this->position]);
    }

    /**
     * Tells whether the parsed document looks like an RSS feed with items.
     *
     * @param mixed $rss Parsed feed, or a falsy value when parsing failed.
     * @return bool True when $rss, its channel and the channel's item are all non-empty.
     */
    public static function check($rss){
        return !(empty($rss) || empty($rss->channel) || empty($rss->channel->item));
    }
}
?>
<?php
/**
 * Iterator over the <entry> elements of a parsed Atom document.
 *
 * Every step yields the array produced by ParserUtils::constructRssItem()
 * from the entry's title, chosen link, content and updated timestamp.
 *
 * The #[\ReturnTypeWillChange] markers silence the PHP 8.1+ deprecation about
 * missing Iterator return types; older PHP versions read them as comments.
 */
class AtomParser implements Iterator {
    /** @var int Zero-based index of the entry the iterator points at. */
    private $position = 0;

    /** @var array List of <entry> nodes, indexed from 0. */
    private $rss = [];

    /**
     * @param mixed $rss Parsed feed exposing ->entry, e.g. the
     *                   SimpleXMLElement returned by simplexml_load_string().
     *                   Anything else results in an empty iterator.
     */
    public function __construct($rss) {
        $this->position = 0;
        if (!is_object($rss) || !isset($rss->entry)) {
            return;
        }
        // Collect the nodes one by one instead of casting the feed to an
        // array: the cast yields a bare object (not a list) when the feed has
        // exactly one <entry>, which used to make such feeds look empty.
        foreach ($rss->entry as $entry) {
            $this->rss[] = $entry;
        }
    }

    /** Moves the iterator back to the first entry. */
    #[\ReturnTypeWillChange]
    public function rewind() {
        $this->position = 0;
    }

    /**
     * @return array|null The current entry as built by
     *                    ParserUtils::constructRssItem(), or null when the
     *                    iterator is past the last entry.
     */
    #[\ReturnTypeWillChange]
    public function current() {
        if (!$this->valid()) {
            return null;
        }
        $c = $this->rss[$this->position];

        // Prefer the first <link> whose type is text/html; when none matches,
        // fall back to the last <link> seen.
        $chosen = null;
        foreach ($c->link as $candidate) {
            $chosen = $candidate;
            if ((string) $candidate['type'] === 'text/html') {
                break;
            }
        }
        // An entry without any <link> gets an empty link instead of indexing null.
        $href = ($chosen === null) ? '' : (string) $chosen['href'];

        return ParserUtils::constructRssItem(
            $c->title,
            $href,
            $c->content,
            $c->updated
        );
    }

    /** @return int Zero-based index of the current entry. */
    #[\ReturnTypeWillChange]
    public function key() {
        return $this->position;
    }

    /** Advances to the next entry. */
    #[\ReturnTypeWillChange]
    public function next() {
        ++$this->position;
    }

    /** @return bool True while the iterator points at an existing entry. */
    #[\ReturnTypeWillChange]
    public function valid() {
        return isset($this->rss[$this->position]);
    }

    /**
     * Tells whether the parsed document looks like an Atom feed with entries.
     *
     * @param mixed $rss Parsed feed, or a falsy value when parsing failed.
     * @return bool True when both $rss and its entry are non-empty.
     */
    public static function check($rss){
        return !(empty($rss) || empty($rss->entry));
    }
}
?>
<?php
require_once('RollingCurl.php');
/**
 * Static helpers shared by the feed parsers: XML-to-array conversion,
 * feed item construction and HTTP downloads through RollingCurl.
 *
 * The class is not meant to be instantiated or cloned.
 */
class ParserUtils
{
    final private function __construct() {}

    // Plain private: PHP 8 warns about "final private" on anything but the constructor.
    private function __clone() {}

    /** @var array cURL options sent with every request (a browser-like User-Agent). */
    public static $curlOpt = [CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17"];

    /**
     * Casts an object to an array, recursing into every property that is
     * itself an object. Non-object values (including arrays) are copied as-is.
     *
     * @param mixed $xmlObject Value to convert; it is cast with (array).
     * @return array
     */
    public static function normalizeXML ($xmlObject)
    {
        $res = [];
        foreach ((array) $xmlObject as $index => $node) {
            $res[$index] = is_object($node) ? self::normalizeXML($node) : $node;
        }
        return $res;
    }

    /**
     * Builds the normalized array describing one feed item.
     *
     * Arguments are cast to string first, so null and stringable objects are
     * accepted without deprecation notices on PHP 8.1+.
     *
     * @param mixed $title   Item title; tags are stripped and the result trimmed.
     * @param mixed $link    Item URL; tags are stripped and the result trimmed.
     * @param mixed $content Item body; trimmed, then every "[crayon-...]" marker is removed.
     * @param mixed $update  Date in any format strtotime() understands.
     * @return array Keys: 'content', 'update' (formatted 'Y-m-d H:i:s'), 'title', 'link'.
     */
    public static function constructRssItem($title, $link, $content, $update){
        $content = preg_replace('/\[crayon-.+\]/U', '', trim((string) $content));
        $title = trim(strip_tags((string) $title));

        // strtotime() returns false for unparseable input; such dates are
        // formatted from timestamp 0, which is what date() did with false.
        $timestamp = strtotime((string) $update);
        $update = date('Y-m-d H:i:s', $timestamp === false ? 0 : $timestamp);

        return [
            'content' => $content,
            'update' => $update,
            'title' => $title,
            'link' => trim(strip_tags((string) $link)),
        ];
    }

    /**
     * Downloads several URLs through one RollingCurl instance.
     *
     * @param array $urls        URLs to fetch.
     * @param int   $threadCount Value assigned to RollingCurl's window_size;
     *                           0 or less means "as many as there are URLs".
     * @return array Response bodies keyed by request URL. Only non-empty
     *               responses with HTTP status 200 are included.
     */
    public static function multiDownLoad($urls, $threadCount = 0){
        // Nothing to fetch: skip RollingCurl rather than run it with a zero window.
        if (empty($urls)) {
            return [];
        }

        $result = [];
        if ($threadCount <= 0) {
            $threadCount = count($urls);
        }
        $rc = new RollingCurl(function($response, $info, $request) use(&$result){
            if ((int) $info["http_code"] === 200 && !empty($response)) {
                $result[$request->url] = $response;
            }
        });
        $rc->window_size = $threadCount;
        foreach ($urls as $url) {
            $rc->get($url, null, self::$curlOpt);
        }
        $rc->execute();
        return $result;
    }

    /**
     * Downloads a single URL through RollingCurl.
     *
     * @param string $url URL to fetch.
     * @return string|null Response body, or null unless the response was
     *                     non-empty with HTTP status 200.
     */
    public static function download($url){
        $result = null;
        $rc = new RollingCurl(function($response, $info, $request) use(&$result){
            if ((int) $info["http_code"] === 200 && !empty($response)) {
                $result = $response;
            }
        });
        $rc->get($url, null, self::$curlOpt);
        $rc->execute();
        return $result;
    }
}
?>
/**
 * Chooses the feed iterator that matches the given raw XML.
 *
 * The RSS layout is probed first, then Atom.
 *
 * @param string $raw_content Raw XML text of the feed.
 * @return RSSParser|AtomParser|null Iterator for the detected format, or
 *                                   null when neither check accepts the document.
 */
public static function getParser($raw_content){
    // libxml warnings and errors are suppressed via the flags; whatever
    // simplexml_load_string() returns is handed to the check() methods.
    $feed = simplexml_load_string($raw_content, 'SimpleXMLElement', LIBXML_NOERROR | LIBXML_NOWARNING);

    if (RSSParser::check($feed)) {
        return new RSSParser($feed);
    }
    if (AtomParser::check($feed)) {
        return new AtomParser($feed);
    }
    return null;
}
If you just need RSS: feeds are ordinary XML files, so you can parse them as XML and take the headlines and links from there.
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question