javascript - How can I change this PhantomJS script to NodeJS for web scraping -
This PhantomJS script scrapes the HTML DOM of a web page. It waits until the DOM is ready before reading it:
// scrap_phantom.js
//
// PhantomJS script: starts a web server and, for each incoming request,
// opens `url`, polls until the document reports readyState "complete", and
// writes the page's full HTML back as the response.
var server = require("webserver").create();
var page = require("webpage").create();
// NOTE(review): the key is kept as `port` exactly as posted; environment
// variable names are usually upper-case (`PORT`) — confirm before relying on it.
var port = require('system').env.port || 3000;
var url = "http://www.example.com";

server.listen(port, function (request, response) {
    // Sends the rendered document's outer HTML to the client, then exits
    // PhantomJS (so the script handles a single request).
    function onPageReady() {
        var htmlContent = page.evaluate(function () {
            return document.documentElement.outerHTML;
        });
        response.write(htmlContent);
        response.close();
        phantom.exit();
    }

    page.open(url, function (status) {
        // Re-schedules itself until the page's readyState is "complete".
        // No delay is passed to setTimeout, so each check is queued to run
        // as soon as possible.
        function checkReadyState() {
            setTimeout(function () {
                var readyState = page.evaluate(function () {
                    return document.readyState;
                });
                if ("complete" === readyState) {
                    onPageReady();
                } else {
                    checkReadyState();
                }
            });
        }
        checkReadyState();
    });
});
I'm testing it in cmd with "phantomjs scrap_phantom.js". The above code works. Now I have changed the code into a Node.js script like this:
// scrap_node.js
//
// Node.js port of the PhantomJS scraper, using the phantomjs-node bridge
// (`require('phantom')`). This is the NON-WORKING version the question asks
// about; its logic is reproduced as posted, and the NOTE comments mark the
// problems.
var http = require("http");
var phantom = require('phantom');
var url = "http://www.example.com";

http.createServer(function (request, response) {
    function onPageReady() {
        // NOTE: `page` is not in scope here — it is only bound as the
        // parameter of the createPage callback further down.
        // NOTE: with phantomjs-node, page.evaluate is asynchronous and hands
        // its result to a callback; it does not return the value.
        var htmlContent = page.evaluate(function () {
            return document.documentElement.outerHTML;
        });
        response.write(htmlContent);
        response.close();
        // NOTE: `phantom` is the module object, which has no exit function;
        // the `ph` instance passed to the phantom.create callback does.
        phantom.exit();
    }

    phantom.create(function (ph) {
        return ph.createPage(function (page) {
            page.open(url, function (status) {
                function checkReadyState() {
                    // NOTE: no delay is passed to setTimeout.
                    setTimeout(function () {
                        // NOTE: same problem as above — the asynchronous
                        // evaluate does not return readyState, so the
                        // comparison below never sees "complete".
                        var readyState = page.evaluate(function () {
                            return document.readyState;
                        });
                        if ("complete" === readyState) {
                            onPageReady();
                        } else {
                            checkReadyState();
                        }
                    });
                }
                checkReadyState();
            });
        });
    }, { dnodeOpts: {weak: false} });
}).listen(3000);
I'm testing it in cmd with "node scrap_node.js". This code does not work for me. It keeps loading for a long time and doesn't return any errors. Why is it not working for me?
There are multiple issues with your code.
phantomjs-node is a bridge between Node.js and PhantomJS. It uses a different syntax, and none of its functions are synchronous. That means if you write this in PhantomJS:
// Plain PhantomJS: evaluate returns the value directly, so code at "2"
// can use `result` on the next line.
var result = page.evaluate(function (arg1, arg2) {
    //...1
    return stuff;
}, "arg1", "arg2");
//...2
then the equivalent in phantomjs-node (see the functionality details) is this:
// phantomjs-node: the result is delivered to a callback passed as the second
// argument; the arguments for the evaluated function follow it.
page.evaluate(function (arg1, arg2) {
    //...1
    return stuff;
}, function (result) {
    //...2
}, "arg1", "arg2");
It is inherently asynchronous.
The other thing is that `phantom` doesn't have an `exit` function, but `ph` does.
Furthermore, `setTimeout(function(){...})` isn't doing anything useful. You need to pass a timeout value for it to be useful.
Comments
Post a Comment