O
O
Oleg2016-12-27 17:19:00
JavaScript
Oleg, 2016-12-27 17:19:00

Help with phantomJs crawler?

Greetings to those who responded to my problem.
I need a crawler that takes a URL, follows the redirects, and returns the last URL in the redirect chain.
But the code doesn't work. What's wrong with it?

// PhantomJS 'system' module: exposes the command-line arguments of the script.
var sys = require('system');
// Page object currently in use; assigned by renderPage.
var page;
// Start URL, taken from the first command-line argument. Also updated by
// renderPage as main-frame navigations are observed.
var myurl = sys.args[1];
// Main-frame URLs seen after the start URL, in navigation order.
var urls = [];

if (!myurl) {
    // A missing argument is a usage error: say so and report failure instead
    // of exiting silently with a success status.
    console.log('Usage: phantomjs ' + sys.args[0] + ' <url>');
    phantom.exit(1);
}

var renderPage = function (url) {
    page = require('webpage').create();
    page.settings.userAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36$'
  page.settings.loadImages = false;

    page.onNavigationRequested = function(url, type, willNavigate, main) {
        if (main && url!=myurl) {
            myurl = url;
            urls.push(url);
            page.close();
            renderPage(url);
        }
    };

    page.open(url, function(status) {
    	console.log(page.content);
    	phantom.exit(0);
        if (status==="success") {
            var newUrl = page.evaluate(function() {
        	var canonical = [].map.call(document.querySelectorAll('link[rel="canonical"]'), function(link) {
              return link.getAttribute('href');
          }).shift();

          if (!location.origin) location.origin = location.protocol + "//" + location.host;
             	
        var out = location.origin;
             	
         	if(typeof canonical != 'undefined'){
         		if(canonical.substring(0,1) == "/"){
            out += canonical;
         		} else {
            out = canonical;
          }
            } else {
                out = "{{home}}" + out;
            }

        	return out;
        
      });
      console.log(newUrl);
            phantom.exit(0);
        } else {
            phantom.exit(1);
        }
    });
} 

// Start crawling only when a start URL is available. phantom.exit() may not
// stop script execution right away, so without this guard a missing argument
// would still reach renderPage with an undefined URL.
if (myurl) {
    renderPage(myurl);
}

//setTimeout(function(){
//	var txt = document.createElement("textarea");
//    txt.innerHTML = page.content;
//    console.log(txt.value);
//    phantom.exit(0);
//	var newUrl = page.evaluate(function() {
//	  	var canonical = [].map.call(document.querySelectorAll('link[rel="canonical"]'), function(link) {
//	        return link.getAttribute('href');
//	    }).shift();
//
//	    if (!location.origin) location.origin = location.protocol + "//" + location.host;
//	       	
//		var out = location.origin;
//	       	
//	   	if(typeof canonical != 'undefined'){
//	   		if(canonical.substring(0,1) == "/"){
//				out += canonical;
//	   		} else {
//				out = canonical;
//			}
//        } else {
//            out = "{{home}}" + out;
//        }
//
//	  	return out;
//	  
//	});
//	console.log(newUrl);
//    phantom.exit(0);
//},5000);

Answer the question

In order to leave comments, you need to log in

Didn't find what you were looking for?

Ask your question

Ask a Question

731 491 924 answers to any question