Advertisement
Guest User

Untitled

a guest
Jan 17th, 2019
45
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.15 KB | None | 0 0
  1.  
  // Debug toggle handed to the page context by scrape(): when true, the page
  // fakes an external redirect after 10 seconds; when false, the page simply
  // signals end-of-page after 10 seconds.
  let redirect = true;
  3. //everything is wrapped in an IIFE so we can use async/await in the 'outer' scope
  4. ( async _ => {
  5.  
  // Base URL that screenshot links sent to WordPress are built from.
  // (Previously considered: reading it from process.argv[3].)
  const serverURL = 'http://server.address.here';
  const puppeteer = require( 'puppeteer' );
  const deviceTypes = require( 'puppeteer/DeviceDescriptors' );
  const WPAPI = require( 'wpapi' );

  // Devices visited one after another when the script is started with 'all'.
  const allDeviceTypes = [ 'mobile', 'desktop' ];

  // bash passes either 'all' (non-simultaneous run) or a single device name
  // such as 'mobile', 'tablet' or 'desktop' (simultaneous run, one process per device).
  const device = process.argv[2];
  const SIMULTANEOUS = device !== 'all';

  // Running totals reported as JSON once scraping has finished.
  let postsScanned = 0;
  let badAdsFound = 0;

  // URLs to visit, and the index of the one currently being scraped.
  const postURLs = await getPostURLs();
  const postCount = postURLs.length;
  let currentPost = 0;

  // Device bookkeeping is only meaningful for an 'all' run.
  const deviceCount = device === 'all' ? allDeviceTypes.length : 0;
  let currentDevice = 0;

  await scraperController( device );
  27.  
  /**
   * Starts scraping the current post for one device.
   *
   * In a simultaneous run (one process per device, driven by the bash script)
   * the device passed in is used as-is. Otherwise the device is taken from
   * `allDeviceTypes[ currentDevice ]`, so the argument is ignored.
   *
   * The scrape is awaited and any rejection is routed to errorHandler;
   * previously the promise returned by scrape() was left floating, so a
   * failure while launching the browser surfaced as an unhandled rejection.
   *
   * @param {string} device Device name to use for a simultaneous run.
   * @returns {Promise<void>} Settles once scrape() for the current post has settled.
   */
  async function scraperController( device ) {
    const targetDevice = SIMULTANEOUS ? device : allDeviceTypes[ currentDevice ];

    try {
      await scrape( postURLs[ currentPost ], targetDevice );
    } catch ( e ) {
      errorHandler( e );
    }
  }
  36.  
  /**
   * Called when every post has been scraped for the current device.
   *
   * Advances to the next device. When the run is simultaneous (single device
   * per process) or all devices are done, prints the totals as a JSON string
   * on stdout for the bash script to consume. Otherwise resets the post index
   * and starts scraping with the next device.
   *
   * scraperController() is async, so its failures are handled with `.catch`;
   * a synchronous try/catch around the call can never see its rejection.
   */
  function finishScrape() {
    currentDevice += 1;

    // `>=` rather than equality so that a repeated call can never start a
    // scrape with an undefined device.
    if ( SIMULTANEOUS || currentDevice >= deviceCount ) {
      // all done, log results out to be consumed by BASH script
      console.log( JSON.stringify( {
        "number_posts_scanned": postsScanned,
        "number_bad_ads_found": badAdsFound
      } ) );
      return;
    }

    // reset post count, run next device
    currentPost = 0;
    scraperController( allDeviceTypes[ currentDevice ] ).catch( errorHandler );
  }
  56.  
  /**
   * Builds the list of URLs to scan: the theChive homepage followed by the
   * links of the posts returned by the site's WP REST API (50 per page,
   * ordered by date).
   *
   * A failed request is handed to errorHandler.
   *
   * @returns {Promise<string[]>} Homepage URL followed by the post links.
   */
  async function getPostURLs() {
    const links = [ 'http://thechive.com/' ];
    const chiveWPAPI = new WPAPI( { endpoint: 'http://thechive.com/wp-json/' } );

    try {
      const posts = await chiveWPAPI.posts().param( 'orderby', 'date' ).param( 'per_page', 50 );
      for ( const post of posts ) {
        links.push( post.link );
      }
    } catch ( e ) {
      errorHandler( e );
    }

    return links;
  }
  69.  
  /**
   * Captures details of the page an ad redirected to and reports them.
   *
   * Increments the bad-ad counter, records the redirect URL, takes a
   * full-page screenshot, reads the browser user agent and then hands
   * everything to sendAdToWP().
   *
   * The captured values are passed to sendAdToWP() explicitly. They used to
   * be assigned to undeclared variables (implicit globals), which throws in
   * strict mode and leaks state between scrapes otherwise.
   *
   * NOTE(review): the screenshot is attempted even when `success` is false,
   * although the recorded screenshot text then says none is available —
   * confirm this is intended.
   *
   * @param {string} link URL of the post that was being scanned.
   * @param {Object} pageAds Bid data collected from the page, keyed by group.
   * @param {Object} browser Puppeteer browser; used for its user agent.
   * @param {Object} page Puppeteer page currently showing the redirect target.
   * @param {string} device Device name, used in the screenshot file name.
   * @param {boolean} success Whether the redirect target loaded; selects the screenshot text.
   * @returns {Promise<void>}
   */
  async function getEvilPageData( link, pageAds, browser, page, device, success ) {
    badAdsFound += 1;

    const timestamp = Math.floor( Date.now() / 1000 );
    const screenshotFile = `${timestamp}-${device}.jpg`;

    const evilPageData = {
      screenshot_url: success
        ? `${serverURL}/scraper/screenshots/${screenshotFile}`
        : `evil page timeout, no screenshot available`,
      redirect_url: await page.url(),
      user_agent: ''
    };

    await page.screenshot( {
      path: `screenshots/${screenshotFile}`,
      fullPage: true,
    }).catch( errorHandler );

    evilPageData.user_agent = await browser.userAgent();

    // awaited so that a failure inside sendAdToWP() is actually caught here
    try {
      await sendAdToWP( browser, page, link, device, pageAds, evilPageData );
    } catch ( e ) {
      errorHandler( e );
    }
  }

  /**
   * Publishes one 'ad' post describing the bad ad to the adstage WordPress
   * site, then asks the page to signal end-of-page via `window.endOfPage()`.
   *
   * @param {Object} browser Puppeteer browser (unused here; kept for the existing call signature).
   * @param {Object} page Puppeteer page on which `window.endOfPage()` is invoked.
   * @param {string} link URL of the post that was being scanned.
   * @param {string} device Device name, used in the post title.
   * @param {Object} pageAds Bid data keyed by group; each value is expected to be an array of bids.
   * @param {Object} [evilPageData] Optional `screenshot_url`, `redirect_url` and `user_agent`
   *   strings captured by getEvilPageData(); each defaults to an empty string.
   * @returns {Promise<void>}
   */
  async function sendAdToWP( browser, page, link, device, pageAds, evilPageData = {} ) {
    const { screenshot_url = '', redirect_url = '', user_agent = '' } = evilPageData;

    const adData = {
      line_item: [],
      location: [],
      prebid: [],
      //currently an array, but should be changed to string in the future
      screenshot_url: [],
      url: link,
      // recorded even when no bid data was captured
      redirect_url,
      user_agent
    };

    //adGroups are for the 2 different PBJS functions, '.getAllWinningBids()' and '.getAllPrebidWinningBids()'
    for ( const ads of Object.values( pageAds || {} ) ) {
      if ( ! Array.isArray( ads ) ) {
        continue;
      }
      for ( const ad of ads ) {
        adData.prebid.push( JSON.stringify( ad ) );
        adData.line_item.push( ad.adId );
        adData.location.push( ad.adUnitCode );
        adData.screenshot_url.push( screenshot_url );
      }
    }

    const chiveBadAds = new WPAPI( {
      endpoint: 'https://adstage.thechive.com/wp-json'
    });

    //register route for CPT 'ad'
    chiveBadAds.ad = chiveBadAds.registerRoute( 'wp/v2', '/ad/(?P<id>)' );

    // `new Date()` is required here: calling Date() as a plain function
    // returns a string, which made toLocaleString() a no-op.
    const postTitle = `${link}-${device}-${ new Date().toLocaleString() }`;

    const badAdPost = {
      title: postTitle,
      type: "ad",
      status: "publish",
      cmb2: {
        "ad-data": adData
      }
    };

    await chiveBadAds.ad().create( badAdPost ).catch( errorHandler );

    await page.evaluate( () => {
      window.endOfPage();
    }).catch( errorHandler );
  }
  150.  
  151. // function inPageActions( page ) {
  152. // page.evaluate( () => {
  153. // //scroll the page by window height every 1.5 seconds until the bottom is reached
  154. // let documentHeight = document.body.offsetHeight,
  155. // windowHeight = window.innerHeight;
  156.  
  157. // //scroll through the page to load lazy-loading ads
  158. // // let intID = window.setInterval( async () => {
  159. // // window.scrollBy( 0, windowHeight );
  160. // // if ( (windowHeight + window.scrollY) >= documentHeight ) {
  161. // // clearInterval( intID );
  162. // // window.endOfPage();
  163. // // }
  164. // // }, 1500);
  165.  
  166. // setTimeout( () => {
  167. // window.endOfPage();
  168. // }, 10000);
  169.  
  170. // // testing a 'fake' 'redirect'
  171. // // setTimeout( _=> {
  172. // // window.top.location.replace("http://ponyfoo.com");
  173. // // }, 10000);
  174. // }).catch( (e) => { errorHandler(e) });
  175. // }
  176.  
  /**
   * Opens one URL in a fresh browser, watches it for an ad-driven redirect
   * and moves on to the next post when the page signals that it is done.
   *
   * Flow:
   *  1. Launch a browser/page and apply the viewport or device emulation.
   *  2. Register listeners: accept the 'beforeunload' dialog, and on the first
   *     top-level document response whose URL does not contain `link`, wait
   *     for the next 'load' and call getEvilPageData().
   *  3. Expose `getAdData` (receives bid data from the page) and `endOfPage`
   *     (closes the browser, then scrapes the next post or calls finishScrape()).
   *  4. Navigate to `link`, install a 'beforeunload' listener that forwards
   *     Prebid winning bids, then schedule either the fake redirect or the
   *     end-of-page signal depending on the `redirect` debug toggle.
   *
   * NOTE(review): any URL that does not contain `link` verbatim is treated as
   * external, which would include a scheme or host rewrite of the same post —
   * confirm this is intended.
   *
   * @param {string} link URL of the post to visit.
   * @param {string} device 'desktop', 'mobile' or 'tablet'; any other value leaves the default viewport.
   * @returns {Promise<void>} Settles once the page has been set up and the timers scheduled.
   */
  async function scrape( link, device ) {

    const browser = await puppeteer.launch( { headless: false } );
    const page = await browser.newPage();

    // bid data handed over by the page through the exposed getAdData()
    let pageAds;
    // ensures a redirect is reported once, however many responses match
    let redirectHandled = false;

    if ( device === 'desktop' ) {
      // no preset 'desktop' device, so just set a desktop resolution
      await page.setViewport( { width: 1260, height: 680 } ).catch( errorHandler );
    } else if ( device === 'mobile' ) {
      await page.emulate( deviceTypes['Pixel 2'] ).catch( errorHandler );
    } else if ( device === 'tablet' ) {
      await page.emulate( deviceTypes['iPad'] ).catch( errorHandler );
    }

    //****** set up listeners on the page object ********

    page.on( 'dialog', async ( dialog ) => {
      try {
        if ( dialog.type() === 'beforeunload' ) {
          // allow external navigation, as ad data has already been captured on 'beforeunload'
          await dialog.accept();
        } else {
          // an unanswered dialog blocks the page, so dismiss everything else
          await dialog.dismiss();
        }
      } catch ( e ) {
        errorHandler( e );
      }
    });

    // when external navigation is detected, hook into 'load' to grab evil page data
    // and send to WP (PBJS data already obtained at this point)
    page.on( 'response', ( res ) => {
      if ( redirectHandled ) {
        return;
      }

      const request = res.request();
      const frame = request.frame();
      const isTopLevelDocument = request.resourceType() === 'document'
        && frame !== null
        && frame.parentFrame() === null;

      if ( res.status() !== 200 || ! isTopLevelDocument || request.url().includes( link ) ) {
        return;
      }

      redirectHandled = true;

      // wait for evil page to load, then capture info; `once` so that later
      // loads cannot report the same redirect again
      page.once( 'load', async () => {
        try {
          await getEvilPageData( link, pageAds, browser, page, device, true );
        } catch ( e ) {
          errorHandler( e );
        }
      });
    });

    //called from page context to pass ad data to Puppeteer
    await page.exposeFunction( 'getAdData', async ( ads ) => {
      pageAds = ads;
    }).catch( errorHandler );

    //gets called in context of page to signal end of current post, time to move on
    await page.exposeFunction( 'endOfPage', async () => {

      postsScanned += 1;
      currentPost += 1;

      await page.close();
      await browser.close();

      if ( currentPost < postCount ) {
        // scrape() is async: its failures are only reachable through the promise
        scrape( postURLs[ currentPost ], device ).catch( errorHandler );
      } else {
        finishScrape();
      }
    }).catch( errorHandler );

    //********** visit the page, add listener to catch 'beforeunload' event and grab appropriate ad data ********

    //visit the page on theChive
    await page.goto( link, {
      timeout: 30000,
      waitUntil: 'domcontentloaded'
    }).catch( errorHandler );

    await page.evaluate( () => {
      window.addEventListener( 'beforeunload', ( event ) => {
        event.preventDefault();
        //return value is required by Chrome
        event.returnValue = 'stop!';
        //first function gets winners that have rendered, second gets winners but not rendered yet.
        //the rendered vs. unrendered split was found to be unreliable, so just grab everything.
        //both calls return arrays synchronously, so the data is handed over before the page unloads
        const ads = {
          winners: pbjs.getAllWinningBids(),
          pb_winners: pbjs.getAllPrebidWinningBids()
        };
        window.getAdData( ads );
      });
    }).catch( errorHandler );

    await page.evaluate( ( shouldRedirect ) => {
      // TODO: scrolling through the page to trigger lazy-loading ads is currently disabled

      if ( shouldRedirect ) {
        // testing a 'fake' 'redirect'
        setTimeout( () => {
          window.top.location.replace("http://ponyfoo.com");
        }, 10000);
      } else {
        setTimeout( () => {
          window.endOfPage();
        }, 10000);
      }
    }, redirect ).catch( errorHandler );

  }
  301.  
  /**
   * Central failure path: prints the error on stdout and terminates Node.
   *
   * The output is not JSON, which the calling bash script treats as a failure.
   *
   * NOTE(review): process.exit() is called without a code, so the process
   * reports the default exit status rather than a failure status — confirm
   * the bash script relies only on the output.
   *
   * @param {*} e The error (or any value) to print before exiting.
   */
  function errorHandler(e) {
    console.log( e );
    process.exit();
  }
  307.  
  308. })();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement