Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- This is my route handler. My crawler scrapes only a single URL, which hits this route directly, and it should shut down afterwards.
- ```ts
/**
 * Route handler that scrapes a single company page.
 *
 * Steps:
 *  1. Collect the text of every `<p>` under `body header .mx-1` as the
 *     company's "about" text (one paragraph per line).
 *  2. Build the company document from `request.userData` (minus `label`),
 *     resolving `company_id` against the loaded URL.
 *  3. If the page has a link named "salary", open it in a separate page and
 *     store the inner text of `.py-5.text-center` in `company.others`.
 *
 * Failures while reading the salary page are logged and leave
 * `company.others` empty; the extra page is always closed.
 */
async ({ page, request, browserController }) => {
  const browser = browserController.browser;
  // `label` is pulled out only so that it is excluded from `companyData`.
  const { label, ...companyData } = request.userData;

  // `allTextContents()` resolves to [] when nothing matches, so no separate
  // zero-match branch is needed (a `textContent()` call on a locator with no
  // match would wait for the timeout and then throw).
  const paragraphTexts = await page
    .locator('body')
    .locator('header')
    .locator('.mx-1')
    .locator('p')
    .allTextContents();
  const companyAbout = paragraphTexts.map((text) => `${text}\n`).join('');

  const companyID = companyData['company_id']
    ? new URL(companyData['company_id'], request.loadedUrl).href
    : request.loadedUrl || '';
  const companyName = companyData['company_name'] || '';

  const company = createCompanyDocument({
    ...companyData,
    resource: state.resource,
    company_name: companyName,
    company_id: companyID,
    about_company: companyAbout,
  });

  let salaryLink = '';
  try {
    const salaryLinkTag = page.getByRole('link', { name: 'salary' });
    salaryLink = (await salaryLinkTag.getAttribute('href')) || '';
  } catch {
    log.error(`salary link for company ${companyID} not found`);
  }

  let companyOthersData = '';
  //TODO future optimization: try using sendRequest api
  if (salaryLink) {
    const salaryUrl = new URL(salaryLink, request.loadedUrl).href;
    const newPage = await browser.newPage();
    try {
      await newPage.goto(salaryUrl);
      const infoDataTag = newPage.locator('.py-5.text-center');

      // Diagnostic only: number of <h1> elements inside the info block.
      const infoDataTag2 = await infoDataTag
        .locator('h1')
        .filter({ hasNot: infoDataTag.locator('h1') })
        .count();
      log.debug(`count ${infoDataTag2}`);

      let infoData = '';
      try {
        await expect(infoDataTag).toBeAttached();
        // `innerText` is a DOM property, not an HTML attribute, so it has to
        // be read with `innerText()`; `getAttribute('innerText')` is always null.
        infoData = await infoDataTag.innerText();
      } catch (error: unknown) {
        log.error(`Info tag for Salary link ${salaryUrl} not found`, {
          error: error instanceof Error ? error.message : String(error),
        });
      }

      if (!infoData) {
        log.error(
          `info data tag for company url ${companyID} - salary url ${salaryLink} does not exist`
        );
      } else {
        companyOthersData = infoData;
      }
    } finally {
      // Close the manually opened page even when navigation or scraping throws.
      await newPage.close();
    }
  }

  company.others = companyOthersData;
  log.debug('Scraped Company', company);
}
- ```
Advertisement
Add Comment
Please, Sign In to add comment