Advertisement
Guest User

Untitled

a guest
Feb 4th, 2016
146
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.70 KB | None | 0 0
  // Zombie browser constructor; every session in this file is created from it.
  var Zombie = require("zombie");

  // User-agent string handed to each Zombie instance.
  var USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36";

  // Credentials used by the self-run at the bottom of the file (placeholder values).
  var testUser = {
    email: "***",
    password: "***"
  };
  7.  
  8.  
  9. var amazon = module.exports = {
  10.  
  11. /*
  12. * Signs in to the amazon dashboard and returns
  13. * an authenticated session to be continued for
  14. * whatever purpose
  15. */
  16. authenticate: function(user, callback) {
  17.  
  18. var browser = new Zombie({
  19. loadCSS: false,
  20. runScripts: false,
  21. maxWait: 30,
  22. waitFor: 1000,
  23. userAgent: USERAGENT,
  24. windowName: "chrome",
  25. site: "https://developer.amazon.com/"
  26. });
  27.  
  28. // go to the sign in page
  29. setLocation(browser, user, "/login.html", function(error, browser) {
  30.  
  31. if (error) {
  32. console.log("[authenticate] error signing in", error, browser.error);
  33. return callback(error || browser.error || "Missing page body");
  34. }
  35.  
  36. // do the sign in
  37. browser
  38. .fill("#ap_email", user.email)
  39. .fill("#ap_password", user.password)
  40. .pressButton("#signInSubmit-input", function(error) {
  41. callback(null, browser);
  42. });
  43. });
  44. },
  45.  
  46. /**
  47. * Scrapes a complete list of the user's games with
  48. * gameId and title data
  49. */
  50. gameList: function(user, browser, callback) {
  51.  
  52. if (!browser || browser.location.pathname != "/myapps.html") {
  53. return setLocation(browser, user, "/myapps.html", function(error, browser) {
  54.  
  55. if (error) {
  56. console.log("[gameList] error setting initial page", error, browser.error);
  57. return callback(error || browser.error || "Missing page body");
  58. }
  59.  
  60. return amazon.gameList(user, browser, callback);
  61. });
  62. }
  63.  
  64. console.log("[gameList] ready to start processing");
  65.  
  66. // get the number of pages
  67. var nav = browser.queryAll("nav[class=pagination]")[0];
  68. var pageLinks = browser.queryAll("a", nav);
  69. var pagePrefix = pageLinks[0].href;
  70. pagePrefix = pagePrefix.substring(0, pagePrefix.indexOf("=") + 1);
  71.  
  72. // scrape each page
  73. var currentPage = 1;
  74. var numPages = pageLinks.length + 1 - 4; // there are 2 << and < links a the start and end
  75. var games = [];
  76. var retries = 0;
  77.  
  78. setLocation(browser, user, pagePrefix + currentPage, function(error, browser) {
  79.  
  80. if (error) {
  81.  
  82. }
  83.  
  84. });
  85.  
  86. scrape();
  87.  
  88. function retry() {
  89. browser.reload(scrape);
  90. }
  91.  
  92. function next() {
  93. currentPage++;
  94. browser.clickLink(pageLinks[currentPage], scrape);
  95. }
  96.  
  97. function scrape(error) {
  98.  
  99. if (error || browserError(browser)) {
  100. console.log("[gameList] error scraping page", currentPage, numPages, error, browser.error, browser.html());
  101. retries++;
  102.  
  103. if (retries < 10) {
  104. console.log("[gameList] retrying page", currentPage, numPages);
  105. return tryAgain(retry, retries);
  106. } else {
  107. return callback(error || browser.error);
  108. }
  109. }
  110.  
  111. console.log("[gameList] scraping page", currentPage, numPages, browser.html());
  112.  
  113. var rows = browser.queryAll("td[class=row]");
  114. retries = 0;
  115.  
  116. for (var i = 0; i < rows.length; i++) {
  117. console.log(rows[i]);
  118. var a = browser.query("a", rows[i]);
  119. var span = browser.query("span", a);
  120. var versionId = a.href.replace("https://developer.amazon.com/application/general/", "").replace("/detail.html", "");
  121.  
  122. games.push({
  123. title: span.innerHTML,
  124. versionId: versionId
  125. });
  126. }
  127.  
  128. if (currentPage < numPages) {
  129. return setTimeout(next, sleep());
  130. }
  131.  
  132. // finished
  133. console.log("[gameList] finished", games.length + " games");
  134. return callback(null, browser, games);
  135. }
  136. },
  137.  
  138. /**
  139. * Scrapes a user's games and collects all the meta
  140. * data for it.
  141. */
  142. gameData: function(user, games, browser, callback) {
  143.  
  144. var gameData = {};
  145. var gameIds = [];
  146. var i;
  147.  
  148. for (i = 0; i < games.length; i++) {
  149. gameData[games[i].versionId] = games[i];
  150. gameIds.push(games[i].versionId);
  151. }
  152.  
  153. function scrape() {
  154.  
  155. var gameId = gameIds.shift();
  156. var game = gameData[gameId];
  157. var jobs = [pricing, text, general, releaseNotes, contentRatings, media];
  158.  
  159. setLocation(browser, user, "/application/general/" + gameId + "/detail.html", function(error, browser) {
  160.  
  161. if (error) {
  162. console.log("[gameList] error setting initial page", error, browser.error);
  163. return callback(error || browser.error || "Missing page body");
  164. }
  165.  
  166. next();
  167. });
  168.  
  169. function done() {
  170.  
  171. if (jobs.length === 0) {
  172.  
  173. // no more games
  174. if (gameIds.length === 0) {
  175.  
  176. games = [];
  177.  
  178. for (var game in gameData) {
  179. games.push(game);
  180. }
  181.  
  182. return callback(null, browser, games);
  183. }
  184.  
  185. // next game
  186. return setTimeout(scrape, sleep());
  187. }
  188.  
  189. // next job
  190. return setTimeout(next, sleep());
  191. }
  192.  
  193. function next() {
  194. return jobs.shift()(gameId, game, done);
  195. }
  196. }
  197.  
  198. scrape();
  199.  
  200. // pricing data
  201. function pricing(gameId, game, done) {
  202.  
  203. var retries = 0;
  204. browser.clickLink("#header_nav_availability_pricing_a", process);
  205.  
  206. function retry() {
  207. browser.reload(process);
  208. }
  209.  
  210. function process(error) {
  211.  
  212. if (error) {
  213. console.log("[gameData.pricing] error processing page", error, browser.error);
  214. retries++;
  215.  
  216. if (retries < 10) {
  217. console.log("[gameData.pricing] retrying page");
  218. return tryAgain(retry, retries);
  219. } else {
  220. return callback(error || browser.error);
  221. }
  222. }
  223.  
  224. game.pricing = { };
  225.  
  226. var prow = browser.query("#ro_cal_prices");
  227. var pValues = browser.queryAll("span[class=pricing-val]", prow);
  228. var pCurrencies = browser.queryAll("span[class=pricing-currency]", prow);
  229.  
  230. for (i = 0; i < pValues.length; i++) {
  231.  
  232. var curr = pCurrencies[i].innerHTML.trim();
  233. var price = pValues[i].innerHTML.trim().substring(1);
  234. game.pricing[curr] = price;
  235. }
  236.  
  237. // TODO: availability
  238.  
  239. done();
  240. }
  241. }
  242.  
  243. // text data
  244. function text(gameId, game, done) {
  245.  
  246. var retries = 0;
  247. browser.clickLink("#header_nav_description_a", process);
  248.  
  249. function retry() {
  250. browser.reload(process);
  251. }
  252.  
  253. function process(error) {
  254.  
  255. if (error) {
  256. console.log("[gameData.text] error processing page", error, browser.error);
  257. retries++;
  258.  
  259. if (retries < 10) {
  260. console.log("[gameData.pricing] retrying page");
  261. return tryAgain(retry, retries);
  262. } else {
  263. return callback(error || browser.error);
  264. }
  265. }
  266.  
  267. // get the languages
  268. var langList = browser.query("#collectable_nav_list");
  269. var langLinks = browser.queryAll("a", langList);
  270. var langArr = [];
  271. game.languages = {};
  272.  
  273. for (i = 0; i < langLinks.length; i++) {
  274. var lName = langLinks[i].innerHTML;
  275. var lid = langLinks[i].href.replace("https://developer.amazon.com/application/description/", "")
  276. .replace("/detail.html", "");
  277. game.languages[lid] = lName;
  278. langArr.push(lid);
  279. }
  280.  
  281. var currLang = browser.query("span", langList).innerHTML.trim();
  282.  
  283. game.languages[gameId] = {
  284. language: currLang,
  285. title: browser.query("#ro_display_title").innerHTML.trim(),
  286. shortDesc: browser.query("#ro_short_desc").innerHTML.trim(),
  287. longDesc: browser.query("#ro_long_desc").innerHTML.trim(),
  288. features: browser.query("#ro_bullets").innerHTML.trim(),
  289. keywords: browser.query("#ro_keywords").innerHTML.trim()
  290. };
  291.  
  292. function scrapeLanguage() {
  293. var langId = langArr.shift();
  294.  
  295. browser.visit("/application/description/" + langId + "/detail.html", function(error) {
  296.  
  297. if (error || browser.error) {
  298. return callback(error || browser.error);
  299. }
  300.  
  301. game.languages[langId] = {
  302. language: currLang,
  303. title: browser.query("#ro_display_title").innerHTML.trim(),
  304. shortDesc: browser.query("#ro_short_desc").innerHTML.trim(),
  305. longDesc: browser.query("#ro_long_desc").innerHTML.trim(),
  306. features: browser.query("#ro_bullets").innerHTML.trim(),
  307. keywords: browser.query("#ro_keywords").innerHTML.trim()
  308. };
  309.  
  310. if (langArr.length > 0) {
  311. return setTimeout(scrapeLanguage, 500);
  312. }
  313.  
  314. return done();
  315. });
  316. }
  317. }
  318. }
  319.  
  320. // general data
  321. function general(gameId, game, done) {
  322.  
  323. browser.visit("/application/general/" + gameId + "/detail.html", function(error) {
  324.  
  325. if (error || browser.error) {
  326. return callback(error || browser.error);
  327. }
  328.  
  329. game.title = browser.query("#ro_title").innerHTML.trim();
  330. game.sku = browser.query("#ro_sku").innerHTML.trim();
  331. game.privacy = browser.query("#ro_privacy_policy_url").innerHTLM.trim();
  332. game.categories = browser.query("#ro_category").innerHTML.trim().split(" - ");
  333. game.support = {
  334. email: browser.query("#ro_support_email").innerHTLM.trim(),
  335. phone: browser.query("#ro_support_phone").innerHTLM.trim(),
  336. website: browser.query("#ro_support_website").innerHTLM.trim(),
  337. };
  338.  
  339. // app key doesn't have an ID on the td
  340. var tds = browser.queryAll("td", browser.query("#ro_title").parentNode);
  341.  
  342. for (i = 0; i < tds.length; i++) {
  343. if (tds[i].innerHTML.indexOf("Application Key") > -1) {
  344. game.appKey = tds[i + 1].innerHTLM.trim();
  345. break;
  346. }
  347. }
  348.  
  349. done();
  350. });
  351. }
  352.  
  353. function releaseNotes(gameId, game, done) {
  354.  
  355. browser.visit("/application/releasenotes/" + gameId + "/detail.html?default", function(error) {
  356.  
  357. if (error || browser.error) {
  358. return callback(error || browser.error);
  359. }
  360.  
  361. // get the languages
  362. var langList = browser.query("#collectable_nav_list");
  363. var lankLinks = browser.queryAll("a", langList);
  364. var langArr = [];
  365. game.languages = {};
  366.  
  367. for (i = 0; i < languages.length; i++) {
  368. var lName = languages[i].innerHTML;
  369. var lid = languages[i].href.replace("https://developer.amazon.com/application/releasenotes/", "").replace("/detail.html", "");
  370. languages[lid] = lName;
  371. langArr.push(lid);
  372. }
  373.  
  374. var currLang = browser.query("span", langList).innerHTML.trim();
  375.  
  376. game.languages[gameId] = {
  377. language: currLang,
  378. title: browser.query("#ro_display_title").innerHTML.trim(),
  379. shortDesc: browser.query("#ro_short_desc").innerHTML.trim(),
  380. longDesc: browser.query("#ro_long_desc").innerHTML.trim(),
  381. features: browser.query("#ro_bullets").innerHTML.trim(),
  382. keywords: browser.query("#ro_keywords").innerHTML.trim()
  383. };
  384.  
  385. function scrapeLanguage() {
  386. var langId = langArr.shift();
  387.  
  388. browser.visit("/application/releasenotes/" + langId + "/detail.html", function(error) {
  389.  
  390. if (error || browser.error) {
  391. return callback(error || browser.error);
  392. }
  393.  
  394. game.languages[langId] = {
  395. language: currLang,
  396. releaseNotes: browser.query("#ro_release_notes").innerHTML.trim()
  397. };
  398.  
  399. if (langArr.length > 0) {
  400. return setTimeout(scrapeLanguage, 500);
  401. }
  402.  
  403. return done();
  404. });
  405. }
  406. });
  407. }
  408.  
  409. function contentRatings(gameId, game, done) {
  410.  
  411. browser.visit("/application/rating/" + gameId + "/detail.html", function(error) {
  412.  
  413. if (error || browser.error) {
  414. return callback(error || browser.error);
  415. }
  416.  
  417. game.contentRatings = {
  418. alcohol: browser.query("#ro_maturity_field_2").innerHTML.trim(),
  419. cartoonViolence: browser.query("#ro_maturity_field_3").innerHTML.trim(),
  420. intolerance: browser.query("#ro_maturity_field_4").innerHTML.trim(),
  421. nudity: browser.query("#ro_maturity_field_7").innerHTML.trim(),
  422. profanity: browser.query("#ro_maturity_field_8").innerHTML.trim(),
  423. realisticViolence: browser.query("#ro_maturity_field_9").innerHTML.trim(),
  424. sex: browser.query("#ro_maturity_field_10").innerHTML.trim()
  425. };
  426.  
  427. // other ratings
  428. var others = browser.query("#ro_maturity_boolean_flags").innerHTML.trim();
  429. game.contentRatings.others = {
  430. pii: others.indexOf("Account creation or other personal information collection") > -1,
  431. ads: others.indexOf("Advertisements") > -1,
  432. gambling: others.indexOf("Gambling") > -1,
  433. location: others.indexOf("Location detection or Location Based Services") > -1,
  434. userContent: others.indexOf("User Generated Content or User to User Communication") > -1
  435. };
  436.  
  437. done();
  438. });
  439. }
  440.  
  441. function media(gameId, game, done) {
  442. done();
  443. }
  444. }
  445. };
  446.  
  /**
   * Points the browser at `url`. When no browser is supplied a session is
   * created through amazon.authenticate first. A failed attempt is retried
   * through tryAgain until 10 attempts have failed in total.
   *
   * @param {Object|null} browser - session to navigate, or null to sign in first
   * @param {Object} user - credentials, only used when signing in
   * @param {string} url - location handed to browser.visit
   * @param {function} callback - callback(null, browser) on success, or
   *   callback(error, browser) once the retries are used up (browser may be
   *   undefined when signing in never produced a session)
   * @param {number} [retries] - attempts that already failed; used by the
   *   internal retry and normally omitted by callers
   */
  function setLocation(browser, user, url, callback, retries) {

    // default first so the log line shows 0 rather than undefined
    retries = retries || 0;

    console.log("[setLocation] setting location", url, retries);

    function retry() {
      setLocation(browser, user, url, callback, retries);
    }

    // get an authenticated session if we're not provided one
    if (!browser) {

      return amazon.authenticate(user, function(error, session) {

        if (error || browserError(session)) {
          retries++;

          if (retries < 10) {
            return tryAgain(retry, retries);
          }

          // session can be missing here, so guard the property access and
          // pass along whatever we have, like the visit branch does
          return callback(error || (session && session.error) || "No document body", session);
        }

        return setLocation(session, user, url, callback);
      });
    }

    browser.visit(url, function(error) {

      if (error || browserError(browser)) {
        console.log("[setLocation] error visiting page", error, browser.error);
        retries++;

        if (retries < 10) {
          console.log("[setLocation] retrying");
          return tryAgain(retry, retries);
        }

        return callback(error || browser.error || "No document body", browser);
      }

      callback(null, browser);
    });
  }
  496.  
  /**
   * Picks a randomised delay, in milliseconds, for use between page views.
   *
   * @param {number} [min] - smallest delay to return; anything that is not a
   *   positive number falls back to 1000
   * @returns {number} an integer in the range [min, min + 2000)
   */
  function sleep(min) {
    // the requested minimum has to win over the default, otherwise the
    // growing back-off requested by tryAgain is silently ignored
    var base = min > 0 ? min : 1000;
    return base + Math.floor(Math.random() * 2000);
  }
  504.  
  /**
   * Schedules `func` with setTimeout after a pause obtained from sleep(),
   * which is asked for a minimum of `numRetries` * 1000.
   *
   * @param {function} func - the action to run again
   * @param {number} numRetries - number of attempts that already failed
   * @returns whatever setTimeout returns
   */
  function tryAgain(func, numRetries) {
    var minimum = numRetries * 1000;
    var delay = sleep(minimum);

    return setTimeout(func, delay);
  }
  508.  
  /**
   * Detects a page that did not really load: in that case no exception
   * surfaces until the document is touched, so a harmless query is run
   * here to provoke it.
   *
   * @param {Object|null} browser - session to probe
   * @returns {boolean} true when browser is null or the probe query throws,
   *   false when the query completes
   */
  function browserError(browser) {

    var failed = true;

    if (browser !== null) {
      try {
        browser.queryAll("a");
        failed = false;
      } catch ( probeFailure ) {
        // touching the document threw: leave `failed` set
      }
    }

    return failed;
  }
  527.  
  // Self-run: fetch the game list for the test user, enrich it with the
  // detail data and print the first game as JSON.
  amazon.gameList(testUser, null, function(listError, session, list) {

    if (listError) {
      console.log("[amazon.gameList error]", listError);
      return;
    }

    // add data
    amazon.gameData(testUser, list, session, function(dataError, sameSession, detailed) {

      if (dataError) {
        console.log("[amazon.gameData error]", dataError);
        return;
      }

      console.log(JSON.stringify(detailed[0]));
    });
  });
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement