Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Debug toggle handed to the page context by scrape(): while true, the visited page fakes an
// ad 'redirect' to an external site instead of signalling a normal end of page.
let redirect = true;
- //everything is wrapped in an IIFE so we can use async/await in the 'outer' scope
- ( async _ => {
// Base URL used when building the public screenshot links.
// (Could alternatively be read from the command line: process.argv[3].)
const serverURL = 'http://server.address.here';
const puppeteer = require('puppeteer');
const deviceTypes = require('puppeteer/DeviceDescriptors');
const WPAPI = require( 'wpapi' );

// Device types that are walked one after another in a non simultaneous run.
const allDeviceTypes = [ 'mobile', 'desktop' ];
// bash passes either 'all' (non simultaneous run), or one of 'mobile', 'tablet', 'desktop' (simultaneous run)
const device = process.argv[2];
const SIMULTANEOUS = device !== 'all';

// Running totals reported as JSON once the run is finished.
let postsScanned = 0;
let badAdsFound = 0;

// The list of pages to visit and the cursor into it.
const postURLs = await getPostURLs();
const postCount = postURLs.length;
let currentPost = 0;

// Number of device passes in a non simultaneous run (0 when bash handles the concurrency).
const deviceCount = SIMULTANEOUS ? 0 : allDeviceTypes.length;
let currentDevice = 0;

// Kick off the first scrape.
await scraperController( device );
// Starts scraping the current post for one device type.
//
// In a simultaneous run (bash handles the concurrency) the device passed in is used; otherwise
// the device at index `currentDevice` of `allDeviceTypes` is used and the argument is ignored.
//
// The scrape() promise is awaited so that a failure while setting up the page is handed to
// errorHandler() instead of becoming an unhandled rejection.
async function scraperController( device ) {
    const targetDevice = SIMULTANEOUS ? device : allDeviceTypes[ currentDevice ];
    try {
        await scrape( postURLs[ currentPost ], targetDevice );
    } catch ( e ) {
        errorHandler( e );
    }
}
// Called once every post has been visited for the current device.
//
// Either prints the final results as JSON (consumed by the bash script) or, in a non
// simultaneous run with device types left, rewinds the post cursor and starts the next device.
function finishScrape() {
    currentDevice += 1;

    if ( SIMULTANEOUS || currentDevice === deviceCount ) {
        //all done, log results out to be consumed by BASH script
        const resultsJSON = JSON.stringify({
            "number_posts_scanned": postsScanned,
            "number_bad_ads_found": badAdsFound
        });
        console.log( resultsJSON );
        return;
    }

    //reset post count, run next device
    currentPost = 0;
    // scraperController() is async, so a failure surfaces as a rejected promise and has to be
    // caught on the promise itself; a synchronous try/catch would never see it.
    scraperController( allDeviceTypes[ currentDevice ] ).catch( errorHandler );
}
// Builds the list of pages to scan: the homepage followed by the links of the 50 posts
// returned by the thechive WP REST API when ordered by date.
//
// Returns an array of URL strings. Any failure of the API request is handed to errorHandler().
async function getPostURLs() {
    const postURLs = [ 'http://thechive.com/' ];
    const chiveWPAPI = new WPAPI({ endpoint: 'http://thechive.com/wp-json/' });

    try {
        // the request object is thenable, so it can be awaited directly
        const posts = await chiveWPAPI.posts().param( 'orderby', 'date' ).param( 'per_page', 50 );
        for ( const post of posts ) {
            postURLs.push( post.link );
        }
    } catch ( e ) {
        errorHandler( e );
    }

    return postURLs;
}
// Records a bad ad: counts it, captures the page the browser ended up on (URL, screenshot,
// user agent) and forwards everything to sendAdToWP().
//
//   link     - URL of the post that was being scanned
//   pageAds  - bid data handed over by the page before it navigated away (may be undefined)
//   browser  - browser instance, queried for its user agent
//   page     - page object, now showing the page that was navigated to
//   device   - device type label, used in the screenshot file name and the post title
//   success  - when falsy, a placeholder text is reported instead of a screenshot URL
//
// The captured values are passed to sendAdToWP() explicitly rather than through undeclared
// (implicitly global) variables, and the call is awaited so that its failures reach errorHandler().
async function getEvilPageData( link, pageAds, browser, page, device, success ) {
    badAdsFound += 1;

    const timestamp = Math.floor( Date.now() / 1000 );
    const screenshotUrl = success
        ? `${serverURL}/scraper/screenshots/${timestamp}-${device}.jpg`
        : `evil page timeout, no screenshot available`;
    const redirectUrl = await page.url();

    await page.screenshot( {
        path: `screenshots/${timestamp}-${device}.jpg`,
        fullPage: true,
    }).catch( errorHandler );

    const userAgent = await browser.userAgent();

    try {
        await sendAdToWP( browser, page, link, device, pageAds, { screenshotUrl, redirectUrl, userAgent } );
    } catch ( e ) {
        errorHandler( e );
    }
}

// Publishes one 'ad' post describing the bad ad to the WP REST API, then asks the page to
// signal the end of the current post by calling window.endOfPage().
//
//   browser  - unused here, kept so the signature stays compatible
//   page     - page object on which window.endOfPage() is invoked
//   link     - URL of the post that was being scanned
//   device   - device type label, used in the post title
//   pageAds  - object whose values are arrays of bid objects (one array per PBJS function)
//   evilPage - optional { screenshotUrl, redirectUrl, userAgent }; missing fields default to ''
async function sendAdToWP( browser, page, link, device, pageAds, evilPage = {} ) {
    const { screenshotUrl = '', redirectUrl = '', userAgent = '' } = evilPage;

    const adData = {
        line_item: [],
        location: [],
        prebid: [],
        //currently an array, but should be changed to string in the future
        screenshot_url: [],
        url: link,
        // recorded even when no bid data was captured, so the redirect target is never lost
        redirect_url: redirectUrl,
        user_agent: userAgent
    };

    //adGroups are for the 2 different PBJS functions, '.getAllWinningBids()' and '.getAllPrebidWinningBids()'
    for ( const adGroup of Object.keys( pageAds || {} ) ) {
        for ( const ad of pageAds[ adGroup ] ) {
            adData.prebid.push( JSON.stringify( ad ) );
            adData.line_item.push( ad.adId );
            adData.location.push( ad.adUnitCode );
            adData.screenshot_url.push( screenshotUrl );
        }
    }

    const chiveBadAds = new WPAPI( {
        endpoint: 'https://adstage.thechive.com/wp-json'
    });
    //register route for CPT 'ad'
    chiveBadAds.ad = chiveBadAds.registerRoute('wp/v2', '/ad/(?P<id>)');

    // Date() called without `new` already returns a string, so the title carries the default
    // date string format.
    const postTitle = `${link}-${device}-${ Date().toLocaleString() }`;
    const badAdPost = {
        title: postTitle,
        type: "ad",
        status: "publish",
        cmb2: {
            "ad-data": adData
        }
    };

    await chiveBadAds.ad().create( badAdPost ).catch( errorHandler );

    // move on to the next post
    await page.evaluate( _ => {
        window.endOfPage();
    }).catch( errorHandler );
}
- // function inPageActions( page ) {
- // page.evaluate( () => {
- // //scroll the page by window height every 1.5 seconds until the bottom is reached
- // let documentHeight = document.body.offsetHeight,
- // windowHeight = window.innerHeight;
- // //scroll through the page to load lazy-loading ads
- // // let intID = window.setInterval( async () => {
- // // window.scrollBy( 0, windowHeight );
- // // if ( (windowHeight + window.scrollY) >= documentHeight ) {
- // // clearInterval( intID );
- // // window.endOfPage();
- // // }
- // // }, 1500);
- // setTimeout( () => {
- // window.endOfPage();
- // }, 10000);
- // // testing a 'fake' 'redirect'
- // // setTimeout( _=> {
- // // window.top.location.replace("http://ponyfoo.com");
- // // }, 10000);
- // }).catch( (e) => { errorHandler(e) });
- // }
// Visits one post in a freshly launched browser and watches for an ad forcing the top-level
// page to navigate away.
//
//   link   - URL of the post to visit
//   device - 'desktop', 'mobile' or 'tablet'; any other value leaves the page's defaults untouched
//
// Flow:
//   * a 'beforeunload' listener in the page hands the PBJS winning bids to Node (getAdData)
//   * a 200 'document' response in the top-level frame whose URL does not contain `link` is
//     treated as an external navigation; once that page has loaded, getEvilPageData() records it
//   * the page calls window.endOfPage() when it is done, which closes the browser and either
//     scrapes the next post or calls finishScrape()
async function scrape( link, device ) {
    const browser = await puppeteer.launch( { headless: false } );
    const page = await browser.newPage();
    // bid data handed over from the page context just before it navigates away
    let pageAds;
    // makes sure the evil page is captured at most once per visited post
    let isRedirectHooked = false;

    if ( device === 'desktop' ) {
        // no preset 'desktop' device, so just set a desktop resolution
        await page.setViewport( { width: 1260, height: 680 } );
    } else if ( device === 'mobile' ) {
        await page.emulate( deviceTypes['Pixel 2'] ).catch( errorHandler );
    } else if ( device === 'tablet' ) {
        await page.emulate( deviceTypes['iPad'] ).catch( errorHandler );
    }

    //****** set up listeners on the page object ********

    //accept the dialog box that will appear before an external navigation
    page.on( 'dialog', async ( dialog ) => {
        // allow external navigation, as ad data has already been captured on 'beforeunload'
        if ( dialog.type() === 'beforeunload' ) {
            await dialog.accept().catch( errorHandler );
        }
    });

    // when external navigation is detected, hook into 'load' to grab evil page data and send to WP
    // (PBJS data already obtained at this point)
    page.on( 'response', ( res ) => {
        const request = res.request();
        const frame = request.frame();
        const isTopLevelFrame = Boolean( frame ) && frame.parentFrame() === null;
        const isExternalNavigation =
            res.status() === 200 &&
            request.resourceType() === 'document' &&
            ! request.url().includes( link ) &&
            isTopLevelFrame;

        if ( ! isExternalNavigation || isRedirectHooked ) {
            return;
        }
        isRedirectHooked = true;

        // wait for evil page to load, then capture info; `once` keeps later loads of the same
        // page object from recording the ad a second time
        page.once( 'load', async () => {
            try {
                await getEvilPageData( link, pageAds, browser, page, device, true );
            } catch ( e ) {
                errorHandler( e );
            }
        });
    });

    //called from page context to pass ad data to Puppeteer
    await page.exposeFunction( 'getAdData', async ( ads ) => {
        pageAds = ads;
    }).catch( errorHandler );

    //gets called in context of page to signal end of current post, time to move on
    await page.exposeFunction( 'endOfPage', async () => {
        postsScanned += 1;
        currentPost += 1;
        await page.close();
        await browser.close();

        if ( currentPost < postCount ) {
            // deliberately not awaited (the page that called us is gone); failures are still
            // routed to errorHandler through the promise
            scrape( postURLs[ currentPost ], device ).catch( errorHandler );
        } else {
            finishScrape();
        }
    }).catch( errorHandler );

    //********** visit the page, add listener to catch 'beforeunload' event and grab appropriate ad data ********

    //visit the page on theChive
    await page.goto( link, {
        timeout: 30000,
        waitUntil: 'domcontentloaded'
    }).catch( errorHandler );

    await page.evaluate( () => {
        // The handler is synchronous on purpose: the return value has to be set while the event
        // is still being dispatched for the confirmation dialog to be requested.
        window.addEventListener( 'beforeunload', ( event ) => {
            event.preventDefault();
            //return value is required by Chrome
            event.returnValue = 'stop!';

            //first function gets winners that have rendered, second gets winners but not rendered yet.
            //The results with respect to rendered vs. unrendered were found to be unreliable, so just grab everything
            const prebid = window.pbjs;
            const ads = {
                winners: prebid ? prebid.getAllWinningBids() : [],
                pb_winners: prebid ? prebid.getAllPrebidWinningBids() : []
            };
            window.getAdData( ads );
        });
    }).catch( errorHandler );

    // The flag is flipped here, on the Node side, so that consecutive posts alternate between the
    // fake redirect and a normal end of page; flipping the copy inside the page has no effect.
    const shouldFakeRedirect = redirect;
    redirect = ! redirect;

    // NOTE: scrolling through the page to trigger lazy-loading ads is currently disabled;
    // the page simply waits 10 seconds.
    await page.evaluate( ( fakeRedirect ) => {
        if ( fakeRedirect ) {
            // testing a 'fake' 'redirect'
            setTimeout( () => {
                window.top.location.replace("http://ponyfoo.com");
            }, 10000);
        } else {
            setTimeout( () => {
                window.endOfPage();
            }, 10000);
        }
    }, shouldFakeRedirect ).catch( errorHandler );
}
// Fatal error handler: logs the error (non-JSON output, which bash will treat as a failure)
// and terminates node.js.
//
// The process exits with status 1 so the failure is also visible through the exit code, not only
// through the non-JSON output.
function errorHandler(e) {
    console.log(e);
    process.exit(1);
}
- })();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement