通过 PhantomJS 抓取仁医在线的练习题
作者:互联网
先模拟登录,再按指定课程逐个抓取练习题。目前脚本还不够完善:抓取结果中可能出现重复题目。
/**
 * PhantomJS script: logs in to www.renyiwang.net through its /Mobile/ pages,
 * then walks a fixed list of courses, loading the practice page once per
 * question and appending each question (title, options, answer) to a text
 * file named after the category shown on the page.
 *
 * Known limitation (noted by the original author): the same question may be
 * captured more than once.
 *
 * NOTE(review): kept in ES5 syntax (`var`, function expressions, string
 * concatenation) because the file uses no ES2015 features and PhantomJS's
 * script engine presumably does not support them - confirm before modernizing.
 */
var page = require('webpage').create();
var fs = require('fs');

phantom.outputEncoding = 'gbk';
page.settings.userAgent = 'chrome';
// Forward console output from the loaded page to the PhantomJS console.
page.onConsoleMessage = function (msg) {
    console.log(msg);
};

// Directory (with trailing separator) that receives one "<category>.txt" per category.
var OUTPUT_DIR = 'd:\\';
// Delay after submitting the login form before inspecting the resulting page.
var LOGIN_WAIT_MS = 5000;
// Delay between two questions of the same course.
var NEXT_QUESTION_DELAY_MS = 1000;
// Delay after the PracticeClear request before starting the next course.
var NEXT_COURSE_DELAY_MS = 5000;

// Flat list of pairs: [q_id, number of practice pages to load for that q_id, ...].
var entryList = [
    2684, 230,
    2685, 145,
    2686, 235,
    2687, 237,
    2688, 224,
    2689, 117,
    2690, 120,
    2691, 79,
    2692, 80,
    2693, 40,
    2694, 70,
    2695, 80,
    2696, 40,
    2697, 38,
    2698, 90
];
// Value sent as the "SelQuesetions" query parameter on every request.
var sId = 2683;

/**
 * Returns the part of targetString located between two delimiters.
 *
 * @param {string} targetString Text to search; returned unchanged when falsy.
 * @param {string} beginString Left delimiter; when falsy the result starts at index 0.
 * @param {string} endString Right delimiter; when falsy the result runs to the end.
 * @returns {?string} The extracted text. With a single delimiter that is not
 *     found the result is ''; with two delimiters, null when either is missing.
 *     When both delimiters are falsy, targetString itself is returned.
 */
function getBetween(targetString, beginString, endString) {
    if (!targetString) {
        return targetString;
    }
    if (!beginString && !endString) {
        return targetString;
    }

    if (!beginString) {
        var endOnly = targetString.indexOf(endString);
        return endOnly < 0 ? '' : targetString.substring(0, endOnly);
    }

    var beginAt = targetString.indexOf(beginString);
    if (!endString) {
        return beginAt < 0 ? '' : targetString.substring(beginAt + beginString.length);
    }
    if (beginAt < 0) {
        return null;
    }

    var contentStart = beginAt + beginString.length;
    var endAt = targetString.indexOf(endString, contentStart);
    if (endAt < 0) {
        return null;
    }
    return targetString.substring(contentStart, endAt);
}

/**
 * Extracts one question from the practice page markup.
 *
 * @param {string} pageHtml innerHTML of the practice page body.
 * @returns {{title: ?string, options: string[], answerCode: string, categoryName: ?string}}
 *     answerCode is the first character of the option whose id equals the
 *     page's Hid_Answer value, or '' when no option matches.
 */
function parseQuestion(pageHtml) {
    var answerId = getBetween(pageHtml, 'id="Hid_Answer" value="', '"');

    var title = getBetween(pageHtml, 'id="Hid_Choose" value="0">', '</div>');
    if (title) {
        title = title.trim();
    } else {
        console.log("ERROR: " + pageHtml);
    }

    // match() yields null when nothing matches; fall back to an empty list so
    // the loop below cannot throw and leave PhantomJS hanging.
    var optionRefs = pageHtml.match(new RegExp('Rad_T_A_Id_[\\d]+', 'g')) || [];
    var options = [];
    var answerCode = '';
    // Step by two: assumes every option id occurs twice in the markup
    // (presumably on the input and on its label) - TODO confirm against the page.
    for (var i = 0; i < optionRefs.length; i += 2) {
        var optionId = optionRefs[i].replace('Rad_T_A_Id_', '');
        var optionTitle = getBetween(pageHtml, optionRefs[i] + '">', '</label>');
        if (!optionTitle) {
            continue;
        }
        options.push(optionTitle);
        if (answerId === optionId) {
            answerCode = optionTitle[0];
        }
    }

    return {
        title: title,
        options: options,
        answerCode: answerCode,
        categoryName: getBetween(pageHtml, '<span style="font-weight:bold;color:#808080;">', '</span>')
    };
}

/**
 * Builds the output file path for a category.
 *
 * @param {?string} categoryName Category text scraped from the page.
 * @param {number} courseId q_id of the current course; used as a fallback
 *     name when the category could not be extracted.
 * @returns {string} Path under OUTPUT_DIR with characters that Windows
 *     forbids in file names replaced by '_'.
 */
function categoryFilePath(categoryName, courseId) {
    var baseName = categoryName || ('course_' + courseId);
    return OUTPUT_DIR + baseName.replace(/[\\\/:*?"<>|]/g, '_') + '.txt';
}

/**
 * Schedules the next step after a question has been written: the next
 * question of the same course, the first question of the next course
 * (preceded by a PracticeClear request for that course), or script exit.
 *
 * @param {number} entryIndex Index of the current course (pair index into entryList).
 * @param {number} questionNumber 1-based number of the question just captured.
 * @param {?string} categoryName Category of the question just captured (log only).
 */
function scheduleNext(entryIndex, questionNumber, categoryName) {
    var maxCount = entryList[entryIndex * 2 + 1];
    if (questionNumber >= maxCount) {
        var nextEntryIndex = entryIndex + 1;
        if (nextEntryIndex * 2 >= entryList.length) {
            console.log('finished!');
            phantom.exit();
            return;
        }

        var clearUrl = 'http://www.renyiwang.net/Mobile/PracticeClear.aspx?o_id=6&SelQuesetions=' + sId +
            '&q_id=' + entryList[nextEntryIndex * 2];
        page.open(clearUrl, 'post', {}, function (status) {
            if (status !== 'success') {
                // Best effort: report the problem but keep going, as the original flow did.
                console.log('PracticeClear request returned status ' + status + ', continuing anyway');
            }
            // The name logged is the category that just finished; the request
            // itself targets the q_id of the course about to start.
            console.log('PracticeClear ' + categoryName);
            setTimeout(function () {
                captureQuestion(nextEntryIndex, 0);
            }, NEXT_COURSE_DELAY_MS);
        });
    } else {
        setTimeout(function () {
            captureQuestion(entryIndex, questionNumber);
        }, NEXT_QUESTION_DELAY_MS);
    }
}

/**
 * Loads the practice page for one course, appends the question found there to
 * the category file and schedules the next step.
 *
 * @param {number} entryIndex Index of the course (pair index into entryList).
 * @param {number} qIndex Number of questions already captured for this course.
 */
function captureQuestion(entryIndex, qIndex) {
    var questionNumber = qIndex + 1;
    var courseId = entryList[entryIndex * 2];
    var practiceUrl = 'http://www.renyiwang.net/Mobile/Practice.aspx?o_id=6&SelQuesetions=' + sId +
        '&q_id=' + courseId + '&class=0';

    page.open(practiceUrl, function (status) {
        if (status !== 'success') {
            console.log('Unable to access Practice Page, status is ' + status + '!');
            phantom.exit(1);
            return;
        }

        var pageHtml = page.evaluate(function () {
            return document.body.innerHTML;
        });
        if (!pageHtml || pageHtml.indexOf('Rad_T_A_Id') === -1) {
            console.log(pageHtml);
            console.log('fail to open pratice page!');
            phantom.exit(1);
            return;
        }

        var question = parseQuestion(pageHtml);
        var info = '第' + questionNumber + '题:' + question.title + '\n' +
            question.options.join('\n') + '\n' +
            '答案:' + question.answerCode + '\n\n';
        console.log(info);

        var outputPath = categoryFilePath(question.categoryName, courseId);
        try {
            fs.write(outputPath, info, 'a');
        } catch (err) {
            // An uncaught error inside this callback would leave PhantomJS running forever.
            console.log('Unable to write ' + outputPath + ': ' + err);
            phantom.exit(1);
            return;
        }

        scheduleNext(entryIndex, questionNumber, question.categoryName);
    });
}

page.open('http://www.renyiwang.net/Mobile/Login.aspx', function (status) {
    if (status !== 'success') {
        console.log('Unable to access Login Page, status is ' + status + '!');
        phantom.exit(1);
        return;
    }
    console.log('success to open Login Page, status is ' + status + '!');

    // Fill in and submit the login form inside the page context.
    // The two values below are placeholders for the real account credentials.
    page.evaluate(function () {
        document.getElementById('TstNumber').value = "用户名";
        document.getElementById('TstPassword').value = "密码";
        document.getElementById('But_Login').click();
    });

    // Wait a fixed time after the click, then check the page for the marker
    // text the script treats as proof of a successful login.
    setTimeout(function () {
        var pageHtml = page.evaluate(function () {
            return document.body.innerHTML;
        });
        if (!pageHtml || pageHtml.indexOf("三基培训") === -1) {
            console.log(pageHtml);
            phantom.exit(1);
            return;
        }
        console.log('success to login');
        captureQuestion(0, 0);
    }, LOGIN_WAIT_MS);
});
标签:练习题,function,console,log,phantomjs,pageHtml,var,return,仁医 来源: https://www.cnblogs.com/lavezhang/p/16336504.html