Guest User

Untitled

a guest
Dec 7th, 2023
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. This is my route handler. My crawler scrapes only a single URL, which goes directly to this route, and it should shut down afterwards.
  2. ```ts
  3.  
  /**
   * Route handler that scrapes company data from the loaded page.
   *
   * Steps:
   *  1. Join the text of every `<p>` under `header .mx-1` into the "about" text.
   *  2. Resolve the company id (absolute URL) and name from `request.userData`.
   *  3. Build the company document via `createCompanyDocument`.
   *  4. If the page has a "salary" link, open it in a second page and copy the
   *     visible text of `.py-5.text-center` into `company.others`.
   *
   * The second page is closed in a `finally` block so that a navigation or
   * locator failure cannot leave it (and its browser context) open.
   */
  async ({ page, request, browserController }) => {
      const browser = browserController.browser;
      const { label, ...companyData } = request.userData;

      // `allTextContents()` resolves to [] when nothing matches, so no separate
      // count check is needed (and no auto-waiting call on a missing element).
      const aboutParagraphs = await page
          .locator('body')
          .locator('header')
          .locator('.mx-1')
          .locator('p')
          .allTextContents();
      const companyAbout = aboutParagraphs
          .map((content) => content + '\n')
          .join('');

      // A relative `company_id` is resolved against the URL that was loaded.
      const companyID = companyData['company_id']
          ? new URL(companyData['company_id'], request.loadedUrl).href
          : request.loadedUrl || '';
      const companyName = companyData['company_name'] || '';

      const company = createCompanyDocument({
          ...companyData,
          resource: state.resource,
          company_name: companyName,
          company_id: companyID,
          about_company: companyAbout,
      });

      // Best effort: a missing salary link is logged and the handler carries on.
      let salaryLink = '';
      try {
          const salaryLinkTag = page.getByRole('link', { name: 'salary' });
          salaryLink = (await salaryLinkTag.getAttribute('href')) || '';
      } catch {
          log.error(`salary link for company ${companyID} not found`);
      }

      let companyOthersData = '';
      //TODO future optimization: try using sendRequest api
      if (salaryLink) {
          const salaryUrl = new URL(salaryLink, request.loadedUrl).href;
          const newPage = await browser.newPage();
          try {
              await newPage.goto(salaryUrl);
              const infoDataTag = newPage.locator('.py-5.text-center');

              // Debug aid: number of <h1> elements under the info container
              // that pass the `hasNot` filter.
              const headingCount = await infoDataTag
                  .locator('h1')
                  .filter({ hasNot: infoDataTag.locator('h1') })
                  .count();
              log.debug(`count ${headingCount}`);

              let infoTagAttached = true;
              try {
                  await expect(infoDataTag).toBeAttached();
              } catch (error: unknown) {
                  infoTagAttached = false;
                  log.error(`Info tag for Salary link ${salaryUrl} not found`, {
                      reason:
                          error instanceof Error ? error.message : String(error),
                  });
              }

              // `innerText` is a DOM property, not an HTML attribute, so it has
              // to be read with `innerText()`; `getAttribute('innerText')`
              // yields null. Skip the read when the element is not attached,
              // otherwise the call would wait for its timeout and throw.
              const infoData = infoTagAttached
                  ? await infoDataTag.innerText()
                  : '';
              if (!infoData) {
                  log.error(
                      `info data tag for company url ${companyID} - salary url ${salaryLink} does not exist`
                  );
              } else {
                  companyOthersData = infoData;
              }
          } finally {
              // Always release the extra page, even when scraping it failed.
              await newPage.close();
          }
      }

      company.others = companyOthersData;
      log.debug('Scraped Company', company);
  }
  84. ```
Advertisement
Add Comment
Please, Sign In to add comment