daily pastebin goal
53%
SHARE
TWEET

Untitled

a guest Feb 4th, 2016 97 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  // Browser constructor; an instance is created per sign-in in amazon.authenticate.
  var Zombie = require("zombie");

  // User agent string handed to every Zombie browser instance.
  var USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36";

  // Credentials used by the driver code at the bottom of this file
  // (placeholder values in this copy).
  var testUser = {
    email: "***",
    password: "***"
  };
  7.  
  8.  
  9. var amazon = module.exports = {
  10.  
  11.   /*
  12.   * Signs in to the amazon dashboard and returns
  13.   * an authenticated session to be continued for
  14.   * whatever purpose
  15.   */
  16.   authenticate: function(user, callback) {
  17.  
  18.     var browser = new Zombie({
  19.       loadCSS: false,
  20.       runScripts: false,
  21.       maxWait: 30,
  22.       waitFor: 1000,
  23.       userAgent: USERAGENT,
  24.       windowName: "chrome",
  25.       site: "https://developer.amazon.com/"
  26.     });
  27.  
  28.     // go to the sign in page
  29.     setLocation(browser, user, "/login.html", function(error, browser) {
  30.  
  31.       if (error) {
  32.         console.log("[authenticate] error signing in", error, browser.error);
  33.         return callback(error || browser.error || "Missing page body");
  34.       }
  35.  
  36.       // do the sign in
  37.       browser
  38.         .fill("#ap_email", user.email)
  39.         .fill("#ap_password", user.password)
  40.         .pressButton("#signInSubmit-input", function(error) {
  41.           callback(null, browser);
  42.         });
  43.     });
  44.   },
  45.  
  46.   /**
  47.   * Scrapes a complete list of the user's games with
  48.   * gameId and title data
  49.   */
  50.   gameList: function(user, browser, callback) {
  51.  
  52.     if (!browser || browser.location.pathname != "/myapps.html") {
  53.       return setLocation(browser, user, "/myapps.html", function(error, browser) {
  54.  
  55.         if (error) {
  56.           console.log("[gameList] error setting initial page", error, browser.error);
  57.           return callback(error || browser.error || "Missing page body");
  58.         }
  59.  
  60.         return amazon.gameList(user, browser, callback);
  61.       });
  62.     }
  63.  
  64.     console.log("[gameList] ready to start processing");
  65.  
  66.     // get the number of pages
  67.     var nav = browser.queryAll("nav[class=pagination]")[0];
  68.     var pageLinks = browser.queryAll("a", nav);
  69.     var pagePrefix = pageLinks[0].href;
  70.     pagePrefix = pagePrefix.substring(0, pagePrefix.indexOf("=") + 1);
  71.  
  72.     // scrape each page
  73.     var currentPage = 1;
  74.     var numPages = pageLinks.length + 1 - 4; // there are 2 << and < links a the start and end
  75.     var games = [];
  76.     var retries = 0;
  77.  
  78.     setLocation(browser, user, pagePrefix + currentPage, function(error, browser) {
  79.  
  80.       if (error) {
  81.  
  82.       }
  83.  
  84.     });
  85.  
  86.     scrape();
  87.  
  88.     function retry() {
  89.       browser.reload(scrape);
  90.     }
  91.  
  92.     function next() {
  93.       currentPage++;
  94.       browser.clickLink(pageLinks[currentPage], scrape);
  95.     }
  96.  
  97.     function scrape(error) {
  98.  
  99.       if (error || browserError(browser)) {
  100.         console.log("[gameList] error scraping page", currentPage, numPages, error, browser.error, browser.html());
  101.         retries++;
  102.  
  103.         if (retries < 10) {
  104.           console.log("[gameList] retrying page", currentPage, numPages);
  105.           return tryAgain(retry, retries);
  106.         } else {
  107.           return callback(error || browser.error);
  108.         }
  109.       }
  110.  
  111.       console.log("[gameList] scraping page", currentPage, numPages, browser.html());
  112.  
  113.       var rows = browser.queryAll("td[class=row]");
  114.       retries = 0;
  115.  
  116.       for (var i = 0; i < rows.length; i++) {
  117.         console.log(rows[i]);
  118.         var a = browser.query("a", rows[i]);
  119.         var span = browser.query("span", a);
  120.         var versionId = a.href.replace("https://developer.amazon.com/application/general/", "").replace("/detail.html", "");
  121.  
  122.         games.push({
  123.           title: span.innerHTML,
  124.           versionId: versionId
  125.         });
  126.       }
  127.  
  128.       if (currentPage < numPages) {
  129.         return setTimeout(next, sleep());
  130.       }
  131.  
  132.       // finished
  133.       console.log("[gameList] finished", games.length + " games");
  134.       return callback(null, browser, games);
  135.     }
  136.   },
  137.  
  138.   /**
  139.   * Scrapes a user's games and collects all the meta
  140.   * data for it.
  141.   */
  142.   gameData: function(user, games, browser, callback) {
  143.  
  144.     var gameData = {};
  145.     var gameIds = [];
  146.     var i;
  147.  
  148.     for (i = 0; i < games.length; i++) {
  149.       gameData[games[i].versionId] = games[i];
  150.       gameIds.push(games[i].versionId);
  151.     }
  152.  
  153.     function scrape() {
  154.  
  155.       var gameId = gameIds.shift();
  156.       var game = gameData[gameId];
  157.       var jobs = [pricing, text, general, releaseNotes, contentRatings, media];
  158.  
  159.       setLocation(browser, user, "/application/general/" + gameId + "/detail.html", function(error, browser) {
  160.  
  161.         if (error) {
  162.           console.log("[gameList] error setting initial page", error, browser.error);
  163.           return callback(error || browser.error || "Missing page body");
  164.         }
  165.  
  166.         next();
  167.       });
  168.  
  169.       function done() {
  170.  
  171.         if (jobs.length === 0) {
  172.  
  173.           // no more games
  174.           if (gameIds.length === 0) {
  175.  
  176.             games = [];
  177.  
  178.             for (var game in gameData) {
  179.               games.push(game);
  180.             }
  181.  
  182.             return callback(null, browser, games);
  183.           }
  184.  
  185.           // next game
  186.           return setTimeout(scrape, sleep());
  187.         }
  188.  
  189.         // next job
  190.         return setTimeout(next, sleep());
  191.       }
  192.  
  193.       function next() {
  194.         return jobs.shift()(gameId, game, done);
  195.       }
  196.     }
  197.  
  198.     scrape();
  199.  
  200.     // pricing data
  201.     function pricing(gameId, game, done) {
  202.  
  203.       var retries = 0;
  204.       browser.clickLink("#header_nav_availability_pricing_a", process);
  205.  
  206.       function retry() {
  207.         browser.reload(process);
  208.       }
  209.  
  210.       function process(error) {
  211.  
  212.         if (error) {
  213.           console.log("[gameData.pricing] error processing page", error, browser.error);
  214.           retries++;
  215.  
  216.           if (retries < 10) {
  217.             console.log("[gameData.pricing] retrying page");
  218.             return tryAgain(retry, retries);
  219.           } else {
  220.             return callback(error || browser.error);
  221.           }
  222.         }
  223.  
  224.         game.pricing = { };
  225.  
  226.         var prow = browser.query("#ro_cal_prices");
  227.         var pValues = browser.queryAll("span[class=pricing-val]", prow);
  228.         var pCurrencies = browser.queryAll("span[class=pricing-currency]", prow);
  229.  
  230.         for (i = 0; i < pValues.length; i++) {
  231.  
  232.           var curr = pCurrencies[i].innerHTML.trim();
  233.           var price = pValues[i].innerHTML.trim().substring(1);
  234.           game.pricing[curr] = price;
  235.         }
  236.  
  237.         // TODO: availability
  238.  
  239.         done();
  240.       }
  241.     }
  242.  
  243.     // text data
  244.     function text(gameId, game, done) {
  245.  
  246.       var retries = 0;
  247.       browser.clickLink("#header_nav_description_a", process);
  248.  
  249.       function retry() {
  250.         browser.reload(process);
  251.       }
  252.  
  253.       function process(error) {
  254.  
  255.         if (error) {
  256.           console.log("[gameData.text] error processing page", error, browser.error);
  257.           retries++;
  258.  
  259.           if (retries < 10) {
  260.             console.log("[gameData.pricing] retrying page");
  261.             return tryAgain(retry, retries);
  262.           } else {
  263.             return callback(error || browser.error);
  264.           }
  265.         }
  266.  
  267.         // get the languages
  268.         var langList = browser.query("#collectable_nav_list");
  269.         var langLinks = browser.queryAll("a", langList);
  270.         var langArr = [];
  271.         game.languages = {};
  272.  
  273.         for (i = 0; i < langLinks.length; i++) {
  274.           var lName = langLinks[i].innerHTML;
  275.           var lid = langLinks[i].href.replace("https://developer.amazon.com/application/description/", "")
  276.             .replace("/detail.html", "");
  277.           game.languages[lid] = lName;
  278.           langArr.push(lid);
  279.         }
  280.  
  281.         var currLang = browser.query("span", langList).innerHTML.trim();
  282.  
  283.         game.languages[gameId] = {
  284.           language: currLang,
  285.           title: browser.query("#ro_display_title").innerHTML.trim(),
  286.           shortDesc: browser.query("#ro_short_desc").innerHTML.trim(),
  287.           longDesc: browser.query("#ro_long_desc").innerHTML.trim(),
  288.           features: browser.query("#ro_bullets").innerHTML.trim(),
  289.           keywords: browser.query("#ro_keywords").innerHTML.trim()
  290.         };
  291.  
  292.         function scrapeLanguage() {
  293.           var langId = langArr.shift();
  294.  
  295.           browser.visit("/application/description/" + langId + "/detail.html", function(error) {
  296.  
  297.             if (error || browser.error) {
  298.               return callback(error || browser.error);
  299.             }
  300.  
  301.             game.languages[langId] = {
  302.               language: currLang,
  303.               title: browser.query("#ro_display_title").innerHTML.trim(),
  304.               shortDesc: browser.query("#ro_short_desc").innerHTML.trim(),
  305.               longDesc: browser.query("#ro_long_desc").innerHTML.trim(),
  306.               features: browser.query("#ro_bullets").innerHTML.trim(),
  307.               keywords: browser.query("#ro_keywords").innerHTML.trim()
  308.             };
  309.  
  310.             if (langArr.length > 0) {
  311.               return setTimeout(scrapeLanguage, 500);
  312.             }
  313.  
  314.             return done();
  315.           });
  316.         }
  317.       }
  318.     }
  319.  
  320.     // general data
  321.     function general(gameId, game, done) {
  322.  
  323.       browser.visit("/application/general/" + gameId + "/detail.html", function(error) {
  324.  
  325.         if (error || browser.error) {
  326.           return callback(error || browser.error);
  327.         }
  328.  
  329.         game.title = browser.query("#ro_title").innerHTML.trim();
  330.         game.sku = browser.query("#ro_sku").innerHTML.trim();
  331.         game.privacy = browser.query("#ro_privacy_policy_url").innerHTLM.trim();
  332.         game.categories = browser.query("#ro_category").innerHTML.trim().split(" - ");
  333.         game.support = {
  334.           email: browser.query("#ro_support_email").innerHTLM.trim(),
  335.           phone: browser.query("#ro_support_phone").innerHTLM.trim(),
  336.           website: browser.query("#ro_support_website").innerHTLM.trim(),
  337.         };
  338.  
  339.         // app key doesn't have an ID on the td
  340.         var tds = browser.queryAll("td", browser.query("#ro_title").parentNode);
  341.  
  342.         for (i = 0; i < tds.length; i++) {
  343.           if (tds[i].innerHTML.indexOf("Application Key") > -1) {
  344.             game.appKey = tds[i + 1].innerHTLM.trim();
  345.             break;
  346.           }
  347.         }
  348.  
  349.         done();
  350.       });
  351.     }
  352.  
  353.     function releaseNotes(gameId, game, done) {
  354.  
  355.       browser.visit("/application/releasenotes/" + gameId + "/detail.html?default", function(error) {
  356.  
  357.         if (error || browser.error) {
  358.           return callback(error || browser.error);
  359.         }
  360.  
  361.         // get the languages
  362.         var langList = browser.query("#collectable_nav_list");
  363.         var lankLinks = browser.queryAll("a", langList);
  364.         var langArr = [];
  365.         game.languages = {};
  366.  
  367.         for (i = 0; i < languages.length; i++) {
  368.           var lName = languages[i].innerHTML;
  369.           var lid = languages[i].href.replace("https://developer.amazon.com/application/releasenotes/", "").replace("/detail.html", "");
  370.           languages[lid] = lName;
  371.           langArr.push(lid);
  372.         }
  373.  
  374.         var currLang = browser.query("span", langList).innerHTML.trim();
  375.  
  376.         game.languages[gameId] = {
  377.           language: currLang,
  378.           title: browser.query("#ro_display_title").innerHTML.trim(),
  379.           shortDesc: browser.query("#ro_short_desc").innerHTML.trim(),
  380.           longDesc: browser.query("#ro_long_desc").innerHTML.trim(),
  381.           features: browser.query("#ro_bullets").innerHTML.trim(),
  382.           keywords: browser.query("#ro_keywords").innerHTML.trim()
  383.         };
  384.  
  385.         function scrapeLanguage() {
  386.           var langId = langArr.shift();
  387.  
  388.           browser.visit("/application/releasenotes/" + langId + "/detail.html", function(error) {
  389.  
  390.             if (error || browser.error) {
  391.               return callback(error || browser.error);
  392.             }
  393.  
  394.             game.languages[langId] = {
  395.               language: currLang,
  396.               releaseNotes: browser.query("#ro_release_notes").innerHTML.trim()
  397.             };
  398.  
  399.             if (langArr.length > 0) {
  400.               return setTimeout(scrapeLanguage, 500);
  401.             }
  402.  
  403.             return done();
  404.           });
  405.         }
  406.       });
  407.     }
  408.  
  409.     function contentRatings(gameId, game, done) {
  410.  
  411.       browser.visit("/application/rating/" + gameId + "/detail.html", function(error) {
  412.  
  413.         if (error || browser.error) {
  414.           return callback(error || browser.error);
  415.         }
  416.  
  417.         game.contentRatings = {
  418.           alcohol: browser.query("#ro_maturity_field_2").innerHTML.trim(),
  419.           cartoonViolence: browser.query("#ro_maturity_field_3").innerHTML.trim(),
  420.           intolerance: browser.query("#ro_maturity_field_4").innerHTML.trim(),
  421.           nudity: browser.query("#ro_maturity_field_7").innerHTML.trim(),
  422.           profanity: browser.query("#ro_maturity_field_8").innerHTML.trim(),
  423.           realisticViolence: browser.query("#ro_maturity_field_9").innerHTML.trim(),
  424.           sex: browser.query("#ro_maturity_field_10").innerHTML.trim()
  425.         };
  426.  
  427.         // other ratings
  428.         var others = browser.query("#ro_maturity_boolean_flags").innerHTML.trim();
  429.         game.contentRatings.others = {
  430.           pii: others.indexOf("Account creation or other personal information collection") > -1,
  431.           ads: others.indexOf("Advertisements") > -1,
  432.           gambling: others.indexOf("Gambling") > -1,
  433.           location: others.indexOf("Location detection or Location Based Services") > -1,
  434.           userContent: others.indexOf("User Generated Content or User to User Communication") > -1
  435.         };
  436.  
  437.         done();
  438.       });
  439.     }
  440.  
  441.     function media(gameId, game, done) {
  442.       done();
  443.     }
  444.   }
  445. };
  446.  
  /**
   * Sets the browser location to the necessary page.
   *
   * When no browser is supplied, amazon.authenticate is used to create a
   * signed-in one first. A failed sign in or page visit is retried through
   * tryAgain until the retry counter reaches 10.
   *
   * @param {Object|null} browser - existing browser session, or a falsy
   *   value to sign in first
   * @param {Object} user - credentials handed to amazon.authenticate
   * @param {string} url - location passed to browser.visit
   * @param {Function} callback - callback(error, browser) on failure
   *   (browser may be missing when signing in failed), or
   *   callback(null, browser) once the page has loaded
   * @param {number} [retries] - attempts already used; callers normally
   *   leave this out
   */
  function setLocation(browser, user, url, callback, retries) {

    var MAX_RETRIES = 10;

    // default before logging so the first attempt shows 0, not undefined
    retries = retries || 0;

    console.log("[setLocation] setting location", url, retries);

    function retry() {
      setLocation(browser, user, url, callback, retries);
    }

    // get an authenticated session if we're not provided one
    if (!browser) {

      return amazon.authenticate(user, function(error, session) {

        if (error || browserError(session)) {
          retries++;

          if (retries < MAX_RETRIES) {
            return tryAgain(retry, retries);
          }

          // session can be missing here, so do not read .error blindly
          return callback(error || (session && session.error) || "No document body", session);
        }

        return setLocation(session, user, url, callback);
      });
    }

    browser.visit(url, function(error) {

      if (error || browserError(browser)) {
        console.log("[setLocation] error visiting page", error, browser.error);
        retries++;

        if (retries < MAX_RETRIES) {
          console.log("[setLocation] retrying");
          return tryAgain(retry, retries);
        }

        return callback(error || browser.error || "No document body", browser);
      }

      callback(null, browser);
    });
  }
  496.  
  /**
   * A sleep timer for some period of time for use between
   * pageviews.
   *
   * @param {number} [min=1000] - smallest delay to return, in milliseconds;
   *   anything that is not a finite, non-negative number falls back to 1000
   * @returns {number} min plus a random whole number from 0 to 1999
   */
  function sleep(min) {
    // `1000 || min` always evaluated to 1000, so the argument was ignored
    var base = (typeof min === "number" && isFinite(min) && min >= 0) ? min : 1000;

    return base + Math.floor(Math.random() * 2000);
  }
  504.  
  /**
   * Schedules func with setTimeout, using the delay that sleep returns
   * when given numRetries * 1000.
   *
   * @param {Function} func - function to run later
   * @param {number} numRetries - retry count, multiplied by 1000 for sleep
   * @returns the value returned by setTimeout
   */
  function tryAgain(func, numRetries) {
    var requestedMinimum = numRetries * 1000;
    var delay = sleep(requestedMinimum);

    return setTimeout(func, delay);
  }
  508.  
  /**
   * Detects a page that did not really load: such a document raises no
   * error up front, it only throws once it is queried.
   *
   * @param {Object|null} browser - browser to probe
   * @returns {boolean} true when browser is null or querying it throws,
   *   false when the probe query succeeds
   */
  function browserError(browser) {

    var failed = true;

    if (browser !== null) {
      try {
        // the result is irrelevant, only whether the query throws
        browser.queryAll("a");
        failed = false;
      } catch ( probeError ) {
        // leave failed set
      }
    }

    return failed;
  }
  527.  
  // Driver: fetch the game list for the test user, then collect the
  // details of every game and print the first one as JSON.
  amazon.gameList(testUser, null, onGameList);

  // receives the result of amazon.gameList and starts the detail scrape
  function onGameList(listError, session, listedGames) {

    if (listError) {
      return console.log("[amazon.gameList error]", listError);
    }

    // add data
    amazon.gameData(testUser, listedGames, session, onGameData);
  }

  // receives the result of amazon.gameData
  function onGameData(dataError, session, detailedGames) {

    if (dataError) {
      return console.log("[amazon.gameData error]", dataError);
    }

    console.log(JSON.stringify(detailedGames[0]));
  }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top