Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Browser automation library used by scrape() to launch and drive the browser.
const puppeteer = require('puppeteer');
const util = require('util');
// Promise-returning wrapper around child_process.exec so shell commands can be awaited.
const exec = util.promisify(require('child_process').exec);
/**
 * Runs the shell command `nc64 localhost 9151 <tor-change.txt` and logs its output.
 *
 * NOTE(review): presumably this asks a local Tor control port for a new
 * circuit; the contents of tor-change.txt are not visible here, so confirm.
 *
 * The returned promise settles only once the command has finished, so callers
 * that need the change to be complete before continuing can `await` it.
 * Callers that ignore the return value keep working as before.
 *
 * @returns {Promise<void>} Rejects if the command cannot be run or exits with an error.
 */
const changeProxy = async () => {
  const { stdout, stderr } = await exec('nc64 localhost 9151 <tor-change.txt');
  console.log('stdout:', stdout);
  if (stderr) {
    // Surface diagnostics that were previously discarded.
    console.error('stderr:', stderr);
  }
};
/**
 * Installs a fixed User-Agent and a series of `navigator` overrides on a page
 * before any document loads, so the page reports values resembling a regular
 * desktop browser.
 *
 * @param {object} page - Object exposing `setUserAgent` and
 *   `evaluateOnNewDocument` (a Puppeteer page in this file).
 * @returns {Promise<void>} Resolves once every override has been registered.
 */
const preparePageForTests = async (page) => {
  // Pass the User-Agent Test.
  // Note the space between the platform token and "AppleWebKit"; without it
  // the header is malformed.
  const userAgent =
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.39 Safari/537.36';
  await page.setUserAgent(userAgent);

  // Pass the Webdriver Test.
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', {
      get: () => false,
    });
  });

  // Pass the Chrome Test.
  // NOTE(review): detection scripts may look at `window.chrome` rather than
  // `navigator.chrome` - confirm which one the target site checks.
  await page.evaluateOnNewDocument(() => {
    // We can mock this in as much depth as we need for the test.
    window.navigator.chrome = {
      runtime: {},
      // etc.
    };
  });

  // Pass the Permissions Test.
  await page.evaluateOnNewDocument(() => {
    const { permissions } = window.navigator;
    // Keep the receiver bound: calling the native method detached from
    // `navigator.permissions` throws "Illegal invocation".
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) =>
      parameters.name === 'notifications'
        ? Promise.resolve({ state: Notification.permission })
        : originalQuery(parameters);
  });

  // Pass the Plugins Length Test.
  await page.evaluateOnNewDocument(() => {
    // Overwrite the `plugins` property to use a custom getter.
    Object.defineProperty(navigator, 'plugins', {
      // This just needs to have `length > 0` for the current test,
      // but we could mock the plugins too if necessary.
      get: () => [1, 2, 3, 4, 5],
    });
  });

  // Pass the Languages Test.
  await page.evaluateOnNewDocument(() => {
    // Overwrite the `languages` property to use a custom getter.
    Object.defineProperty(navigator, 'languages', {
      get: () => ['en-US', 'en'],
    });
  });
};
/**
 * Turns on request interception for a page and aborts every request whose
 * resource type is `image`; all other requests are allowed to continue.
 *
 * Not called anywhere in the visible code.
 *
 * @param {object} page - Object exposing `setRequestInterception` and `on`
 *   (a Puppeteer page in this file).
 * @returns {Promise<void>} Resolves once interception is enabled and the
 *   handler is registered.
 */
const blockImages = async (page) => {
  await page.setRequestInterception(true);
  page.on('request', (request) => {
    if (request.resourceType() === 'image') {
      request.abort();
    } else {
      request.continue();
    }
  });
};
/**
 * Opens `pageURL` in a freshly launched browser routed through the SOCKS5
 * proxy at localhost:9150, reads the text of the first `.form-type-radio`
 * element and extracts a trailing dollar price from it.
 *
 * The browser is always closed, including when navigation or the selector
 * wait fails. When the text does not contain a price, a message is logged and
 * the run still completes so the caller is not left waiting.
 *
 * @param {string} pageURL - Address of the page to scrape.
 * @param {Function} [resolve] - Completion callback, invoked after the
 *   browser has closed with the matched price (or `undefined` when the text
 *   did not match). Not invoked when an error is thrown.
 * @returns {Promise<string|undefined>} The numeric part of the price, e.g.
 *   `"27.95"`, or `undefined` when no price was found.
 * @throws Propagates any error raised while driving the page (for example
 *   the 3000 ms selector timeout).
 */
const scrape = async (pageURL, resolve) => {
  const resolutions = [[1366, 768], [1920, 1080], [1536, 864]];
  const priceRegEx = /(\$)(\d{1,4}\.\d{1,2})$/;
  const priceStringSelector = '.form-type-radio';

  const browser = await puppeteer.launch({
    headless: false,
    args: ['--proxy-server=socks5://localhost:9150'],
  });

  let price;
  try {
    const page = await browser.newPage();
    await preparePageForTests(page);

    // Pick one of the preset viewport sizes at random.
    const [width, height] = resolutions[Math.floor(Math.random() * resolutions.length)];
    await page.setViewport({ width, height });

    await page.goto(pageURL, { waitUntil: 'domcontentloaded', referer: '' });
    await page.waitForSelector(priceStringSelector, { timeout: 3000 });

    const priceString = await page.evaluate(
      (selector) => document.querySelector(selector).innerText,
      priceStringSelector,
    );

    const matchedPrice = priceString.match(priceRegEx);
    if (matchedPrice) {
      // Group 2 is the amount without the leading "$".
      price = matchedPrice[2];
    } else {
      console.log(`price matching went wrong, alarm! The string was ${priceString}`);
    }
  } finally {
    // Runs on success, on a failed match and on errors, so no browser is leaked.
    await browser.close();
  }

  if (typeof resolve === 'function') {
    resolve(price);
  }
  return price;
};
// Queue of pages to scrape; scrapeThemAll() removes the first entry after each page is processed.
const pageURLs = ['https://nostarch.com/algorithmic-thinking', 'https://nostarch.com/writegreatcode1_2e'];
/**
 * Scrapes every URL queued in `pageURLs`, one at a time. After each page it
 * calls `changeProxy()` and waits for it, then removes that URL from the
 * front of the queue before moving on.
 *
 * Does nothing when the queue is empty.
 *
 * @returns {Promise<void>} Resolves when the queue has been drained. Rejects
 *   with the first error thrown by `scrape()` or `changeProxy()`; the failing
 *   URL stays at the front of the queue.
 */
async function scrapeThemAll() {
  while (pageURLs.length > 0) {
    // scrape() returns a promise that settles when its browser work is over,
    // so it is awaited directly; the callback argument is only passed to
    // satisfy its signature.
    await scrape(pageURLs[0], () => {});
    // Wait for the proxy change to finish before the next page is requested.
    await changeProxy();
    pageURLs.shift();
  }
}
// Start the run. Promise.resolve() makes the rejection handler safe whether
// scrapeThemAll() returns a promise or nothing at all.
Promise.resolve(scrapeThemAll()).catch((err) => {
  console.error('scraping failed:', err);
  process.exitCode = 1;
});
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement