Where can I find the script (parser/grabber) of rss feeds?

S

Stannis Romanov, 2015-10-13 20:51:45

CMS

Stannis Romanov, 2015-10-13 20:51:45

Hi, could you please point me to a script or an engine that can parse a given list of sites and grab their latest news, or rather just the headlines and the links to them? Something like Google, but simpler :).
If this has to be written from scratch, please point me to tutorials or advise how to implement it (I Googled, but didn't find anything suitable). Thanks!

Reply

Answer the question

In order to leave comments, you need to log in

5 answer(s)

M

MetaDone, 2015-10-14
@MetaDone

There is a homemade project for this; its description can be viewed here.
The site running on it is here.
If you are interested, write to me; if need be, we can extend it for specific needs, or set it up and walk you through it.

F

frees2, 2015-10-13
@frees2

dulsky.eu

A

Alexander Taratin, 2015-10-13
@Taraflex

Shit code from an old project

rssparser.class.php

<?php

require_once('parserutils.class.php');

/**
 * Iterates over the <item> elements of a parsed RSS document.
 *
 * Each step of the iteration yields the array produced by
 * ParserUtils::constructRssItem() from the item's title, link,
 * description and pubDate children.
 */
class RSSParser implements Iterator {
    /** @var int Zero-based index of the item the iterator points at. */
    private $position = 0;

    /** @var SimpleXMLElement[] The feed's <item> elements in document order. */
    private $rss = [];

    /**
     * @param SimpleXMLElement|false $rss Root element of the RSS document
     *                                    (items are read from $rss->channel->item).
     *                                    Anything without such items gives an empty iterator.
     */
    public function __construct($rss) {
        $this->position = 0;

        if (!is_object($rss) || !isset($rss->channel->item)) {
            return;
        }

        // Walk the <item> nodes directly. Casting the channel to an array
        // gives a list only when there are two or more items; a lone item
        // would come back as a single element and the feed would look empty.
        foreach ($rss->channel->item as $item) {
            $this->rss[] = $item;
        }
    }

    /** Moves the iterator back to the first item. */
    #[\ReturnTypeWillChange]
    public function rewind() {
        $this->position = 0;
    }

    /**
     * Builds the normalized representation of the current item.
     * Must only be called while valid() returns true.
     *
     * @return array Result of ParserUtils::constructRssItem().
     */
    #[\ReturnTypeWillChange]
    public function current() {
        $c = $this->rss[$this->position];

        return ParserUtils::constructRssItem(
            $c->title,
            $c->link,
            $c->description,
            $c->pubDate
        );
    }

    /** @return int Index of the current item. */
    #[\ReturnTypeWillChange]
    public function key() {
        return $this->position;
    }

    /** Advances to the next item. */
    #[\ReturnTypeWillChange]
    public function next() {
        ++$this->position;
    }

    /** @return bool Whether the iterator currently points at an existing item. */
    #[\ReturnTypeWillChange]
    public function valid() {
        return isset($this->rss[$this->position]);
    }

    /**
     * Tells whether the parsed document looks like an RSS feed with items.
     *
     * @param SimpleXMLElement|false $rss Result of simplexml_load_string().
     * @return bool True when $rss has a non-empty channel containing a non-empty item.
     */
    public static function check($rss){
        return !(empty($rss) || empty($rss->channel) || empty($rss->channel->item));
    }
}
?>

atomparser.class.php

<?php
/**
 * Iterates over the <entry> elements of a parsed Atom document.
 *
 * Each step of the iteration yields the array produced by
 * ParserUtils::constructRssItem() from the entry's title, chosen link,
 * content and updated children.
 */
class AtomParser implements Iterator {
    /** @var int Zero-based index of the entry the iterator points at. */
    private $position = 0;

    /** @var SimpleXMLElement[] The feed's <entry> elements in document order. */
    private $rss = [];

    /**
     * @param SimpleXMLElement|false $rss Root element of the Atom document
     *                                    (entries are read from $rss->entry).
     *                                    Anything without entries gives an empty iterator.
     */
    public function __construct($rss) {
        $this->position = 0;

        if (!is_object($rss) || !isset($rss->entry)) {
            return;
        }

        // Walk the <entry> nodes directly. Casting the feed to an array
        // gives a list only when there are two or more entries; a lone entry
        // would come back as a single element and the feed would look empty.
        foreach ($rss->entry as $entry) {
            $this->rss[] = $entry;
        }
    }

    /** Moves the iterator back to the first entry. */
    #[\ReturnTypeWillChange]
    public function rewind() {
        $this->position = 0;
    }

    /**
     * Builds the normalized representation of the current entry.
     * Must only be called while valid() returns true.
     *
     * The link used is the first <link> whose type attribute is "text/html";
     * when there is none, the last <link> of the entry is used.
     *
     * @return array Result of ParserUtils::constructRssItem().
     */
    #[\ReturnTypeWillChange]
    public function current() {
        $c = $this->rss[$this->position];

        $lastLink = null;
        foreach ($c->link as $vl) {
            $lastLink = $vl;
            if ((string) $vl['type'] === 'text/html') {
                break;
            }
        }

        // An entry without any <link> leaves $lastLink null; pass an empty
        // string instead of indexing into null.
        $href = ($lastLink === null) ? '' : (string) $lastLink['href'];

        return ParserUtils::constructRssItem(
            $c->title,
            $href,
            $c->content,
            $c->updated
        );
    }

    /** @return int Index of the current entry. */
    #[\ReturnTypeWillChange]
    public function key() {
        return $this->position;
    }

    /** Advances to the next entry. */
    #[\ReturnTypeWillChange]
    public function next() {
        ++$this->position;
    }

    /** @return bool Whether the iterator currently points at an existing entry. */
    #[\ReturnTypeWillChange]
    public function valid() {
        return isset($this->rss[$this->position]);
    }

    /**
     * Tells whether the parsed document looks like an Atom feed with entries.
     *
     * @param SimpleXMLElement|false $rss Result of simplexml_load_string().
     * @return bool True when $rss is non-empty and has a non-empty entry.
     */
    public static function check($rss){
        return !(empty($rss) || empty($rss->entry));
    }
}
?>

parserutils.class.php

<?php

require_once('RollingCurl.php');

/**
 * Static helpers shared by the feed parsers: XML-to-array conversion,
 * feed item normalization and HTTP download through RollingCurl.
 *
 * The class is never instantiated; every member is static.
 */
class ParserUtils
{
    final private function __construct() {}

    // Not declared final: PHP 8 warns about "final private" on any method
    // other than the constructor, and a private method cannot be overridden anyway.
    private function __clone() {}

    /** @var array cURL options passed along with every request (a browser User-Agent). */
    public static $curlOpt = [CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17"];

    /**
     * Recursively converts an object (typically a SimpleXMLElement) into a
     * nested array by casting each level with (array).
     *
     * Only values that are objects are recursed into; a value that is already
     * an array (e.g. several same-named sibling elements) is copied as is,
     * so its members stay objects.
     *
     * @param mixed $xmlObject Value to convert.
     * @return array
     */
    public static function normalizeXML ($xmlObject)
    {
        $res = [];
        foreach ((array) $xmlObject as $index => $node) {
            $res[$index] = is_object($node) ? self::normalizeXML($node) : $node;
        }

        return $res;
    }

    /**
     * Builds the normalized array describing one feed item.
     *
     * @param mixed $title   Item title; tags are stripped and whitespace trimmed.
     * @param mixed $link    Item URL; tags are stripped and whitespace trimmed.
     * @param mixed $content Item body; trimmed, with "[crayon-...]" placeholders removed.
     * @param mixed $update  Date in any format strtotime() understands.
     * @return array Keys: 'content', 'update' ('Y-m-d H:i:s'), 'title', 'link'.
     */
    public static function constructRssItem($title, $link, $content, $update){
        // Arguments are usually SimpleXMLElement nodes; cast them once so the
        // string functions below receive real strings.
        $content = preg_replace('/\[crayon-.+\]/U', '', trim((string) $content));
        $title = trim(strip_tags((string) $title));

        // strtotime() returns false for an empty or unparseable date; such
        // dates are formatted as timestamp 0 (the Unix epoch).
        $timestamp = strtotime((string) $update);
        if ($timestamp === false) {
            $timestamp = 0;
        }

        return [
            'content' => $content,
            'update' => date('Y-m-d H:i:s', $timestamp),
            'title' => $title,
            'link' => trim(strip_tags((string) $link)),
        ];
    }

    /**
     * Downloads several URLs through RollingCurl.
     *
     * @param array $urls        URLs to fetch.
     * @param int   $threadCount Value assigned to RollingCurl's window_size;
     *                           0 or less means "as many as there are URLs".
     * @return array Map of URL => response body, containing only the requests
     *               that answered with HTTP 200 and a non-empty body.
     */
    public static function multiDownLoad($urls, $threadCount = 0){
        $result = [];

        // Nothing to fetch: skip RollingCurl entirely instead of running it
        // with a window size of 0 and no queued requests.
        if (empty($urls)) {
            return $result;
        }

        if ($threadCount <= 0) {
            $threadCount = count($urls);
        }

        $rc = new RollingCurl(function ($response, $info, $request) use (&$result) {
            if ($info["http_code"] == 200 && !empty($response)) {
                $result[$request->url] = $response;
            }
        });
        $rc->window_size = $threadCount;
        foreach ($urls as $url) {
            $rc->get($url, null, self::$curlOpt);
        }
        $rc->execute();

        return $result;
    }

    /**
     * Downloads a single URL through RollingCurl.
     *
     * @param string $url URL to fetch.
     * @return string|null Response body, or null unless the request answered
     *                     with HTTP 200 and a non-empty body.
     */
    public static function download($url){
        $result = null;

        $rc = new RollingCurl(function ($response, $info, $request) use (&$result) {
            if ($info["http_code"] == 200 && !empty($response)) {
                $result = $response;
            }
        });
        $rc->get($url, null, self::$curlOpt);
        $rc->execute();

        return $result;
    }
}

?>

Usage

/**
 * Picks the parser that matches a raw feed document.
 *
 * @param string $raw_content XML text of the feed.
 * @return RSSParser|AtomParser|null A parser over the feed's items, or null
 *                                   when the text is not parseable XML or is
 *                                   neither an RSS nor an Atom feed with entries.
 */
public static function getParser($raw_content){
    // Parse errors and warnings are silenced; failure is reported through the
    // null return instead. LIBXML_NONET keeps libxml from opening network
    // connections while parsing (the content is presumably fetched from
    // third-party sites, so it should not be able to trigger further requests).
    $rss = simplexml_load_string(
        $raw_content,
        'SimpleXMLElement',
        LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET
    );

    if ($rss === false) {
        return null;
    }

    if (RSSParser::check($rss)) {
        return new RSSParser($rss);
    }

    if (AtomParser::check($rss)) {
        return new AtomParser($rss);
    }

    return null;
}

V

Vlad Zhivotnev, 2015-10-14
@inkvizitor68sl

https://wordpress.org/plugins/wp-rss-aggregator/

J

jacksparrow, 2015-10-13
@jacksparrow

If you just need RSS: feeds are simply XML files, so parse them as XML and you will get the headlines and links from there.