Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
const fs = require('fs');
const path = require('path');
const { exec, execFile } = require('child_process');
const { promisify } = require('util');
const execAsync = promisify(exec);
const execFileAsync = promisify(execFile);
const sqlite3 = require('sqlite3').verbose();
// Configuration
//
// The session credentials can be supplied through the LINKEDIN_COOKIES and
// LINKEDIN_CSRF_TOKEN environment variables so that real secrets never have to
// be written into (or committed with) this file. When a variable is unset or
// empty the original placeholder string is used, exactly as before.
const CONFIG = {
  baseUrl: 'https://www.linkedin.com/voyager/api/voyagerJobsDashJobCards',
  decorationId: 'com.linkedin.voyager.dash.deco.jobs.search.JobSearchCardsCollection-220',
  count: 100, // Number of results per request
  maxResultsPerFilter: 1000, // LinkedIn's limit per filter combination
  maxCallsPerMinute: 350, // Rate limit: 350 API calls per minute
  delayBetweenFilters: 5000, // 5 seconds delay between filter changes
  delayBetweenGeoIds: 10000, // 10 seconds delay between location changes
  dbPath: path.join(__dirname, 'data', 'linkedin_jobs-v3.db'),
  geoIdsFile: path.join(__dirname, 'european-geo-ids.json'),
  cookies: process.env.LINKEDIN_COOKIES || '<your_linkedIn_cookies_here>',
  csrfToken: process.env.LINKEDIN_CSRF_TOKEN || '<your_linkedIn_csrf_token_here>',
};
// Rate limiter state.
// Mutable bookkeeping for the current one-minute window: when the window
// started and how many calls have been counted in it so far.
const rateLimiter = {
  queue: [],
  minuteStartTime: Date.now(),
  callsThisMinute: 0,
};
// Filter combinations to maximize data collection.
// We rotate through these to get different sets of jobs.
//
// Codes used in the table:
//   workplace  - 1: On-site, 2: Remote, 3: Hybrid
//   experience - 1: Internship, 2: Entry level, 3: Associate, 4: Mid-Senior, 5: Director, 6: Executive
//   jobType    - F: Full-time, P: Part-time, C: Contract, T: Temporary, I: Internship, V: Volunteer, O: Other
//
// `verified` and `applyWithLinkedin` hold ready-made query fragments (or '').
const FILTER_COMBINATIONS = (() => {
  // Every entry has the same six keys; unspecified filters default to ''.
  const combo = (description, overrides = {}) => ({
    workplace: '',
    experience: '',
    jobType: '',
    verified: '',
    applyWithLinkedin: '',
    ...overrides,
    description,
  });

  return [
    // Workplace type variations
    combo('On-site jobs', { workplace: '1' }),
    combo('Remote jobs', { workplace: '2' }),
    combo('Hybrid jobs', { workplace: '3' }),

    // Experience level variations
    combo('Internship', { experience: '1' }),
    combo('Entry level', { experience: '2' }),
    combo('Associate', { experience: '3' }),
    combo('Mid-Senior level', { experience: '4' }),
    combo('Director', { experience: '5' }),
    combo('Executive', { experience: '6' }),

    // Job type variations
    combo('Full-time', { jobType: 'F' }),
    combo('Part-time', { jobType: 'P' }),
    combo('Contract', { jobType: 'C' }),
    combo('Temporary', { jobType: 'T' }),
    combo('Internship jobs', { jobType: 'I' }),

    // Special filters
    combo('Verified jobs only', { verified: ',verifiedJob:List(true)' }),
    combo('Easy Apply jobs', { applyWithLinkedin: ',applyWithLinkedin:List(true)' }),

    // Combined filters for more granular results
    combo('Remote Entry-level Full-time', { workplace: '2', experience: '2', jobType: 'F' }),
    combo('Remote Associate Full-time', { workplace: '2', experience: '3', jobType: 'F' }),
    combo('Remote Mid-Senior Full-time', { workplace: '2', experience: '4', jobType: 'F' }),
    combo('On-site Entry-level Full-time', { workplace: '1', experience: '2', jobType: 'F' }),
    combo('On-site Associate Full-time', { workplace: '1', experience: '3', jobType: 'F' }),
    combo('On-site Mid-Senior Full-time', { workplace: '1', experience: '4', jobType: 'F' }),
  ];
})();
// Database instance (assigned by initDatabase and used by the insert/update helpers)
let db;

// Initialize database.
// Ensures the directory of CONFIG.dbPath exists, opens the SQLite file and, if
// a schema.sql file sits next to this script, executes it. The returned promise
// resolves once the database is ready and rejects with the open/exec error.
function initDatabase() {
  return new Promise((resolve, reject) => {
    // Create data directory if it doesn't exist
    const dbDirectory = path.dirname(CONFIG.dbPath);
    if (!fs.existsSync(dbDirectory)) {
      fs.mkdirSync(dbDirectory, { recursive: true });
    }

    const onOpened = (openError) => {
      if (openError) {
        reject(openError);
        return;
      }
      console.log('Connected to SQLite database');

      // Read and execute schema; without a schema file there is nothing more to do
      const schemaFile = path.join(__dirname, 'schema.sql');
      if (!fs.existsSync(schemaFile)) {
        resolve();
        return;
      }

      const schemaSql = fs.readFileSync(schemaFile, 'utf8');
      db.exec(schemaSql, (schemaError) => {
        if (schemaError) {
          reject(schemaError);
          return;
        }
        console.log('Database schema initialized');
        resolve();
      });
    };

    db = new sqlite3.Database(CONFIG.dbPath, onOpened);
  });
}
// Sleep function: returns a promise that resolves (with no value) after `ms` milliseconds.
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
// Rate limiter - ensures we don't exceed CONFIG.maxCallsPerMinute.
//
// Resolves once the caller has been granted a slot in the current one-minute
// window. Two defects of the earlier version are addressed:
//   * the call that opened a fresh window was not counted, and
//   * every caller that slept through a full window reset the counter to zero
//     on waking, so any number of concurrent waiters were released at once.
// The function now re-evaluates the shared state in a loop after each sleep, so
// a slot is only ever handed out while the counter is below the limit.
async function waitForRateLimit() {
  for (;;) {
    const now = Date.now();

    // Start a new window if a minute has passed
    if (now - rateLimiter.minuteStartTime >= 60000) {
      rateLimiter.callsThisMinute = 0;
      rateLimiter.minuteStartTime = now;
    }

    if (rateLimiter.callsThisMinute < CONFIG.maxCallsPerMinute) {
      rateLimiter.callsThisMinute++;
      return;
    }

    // Limit reached: wait for the remainder of the window, then check again
    // (other waiters may have claimed the new window's slots in the meantime).
    const timeToWait = 60000 - (now - rateLimiter.minuteStartTime);
    console.log(` ⏱ Rate limit reached (${CONFIG.maxCallsPerMinute} calls/min). Waiting ${Math.ceil(timeToWait / 1000)}s...`);
    await sleep(timeToWait);
  }
}
// Execute requests continuously with streaming processing (with rate limit).
//
// Parameters:
//   asyncFunctions   - array of zero-argument async functions, one per request
//   onResultCallback - optional; called as (result, completedCount, totalCount)
//                      as soon as each request finishes successfully
//   maxConcurrent    - optional upper bound on requests in flight (default 220,
//                      the value that used to be hard-coded)
//
// Every request first waits for waitForRateLimit(). A failing request or a
// failing callback is logged and does not stop the remaining requests; the
// returned promise resolves when all requests have settled.
async function executeWithStreamingRateLimit(asyncFunctions, onResultCallback, maxConcurrent = 220) {
  const totalRequests = asyncFunctions.length;
  // Guard against 0 / negative / NaN, which would otherwise stall the pool
  const poolSize = Math.max(1, Math.floor(Number(maxConcurrent)) || 1);
  const activePromises = new Set();
  let completedRequests = 0;
  let requestIndex = 0;

  const logFailure = (index, err) => {
    // `err` may be a non-Error value; never let logging itself throw
    console.error(` ✗ Request ${index + 1} failed:`, err?.message ?? err);
  };

  async function executeRequest(fn, index) {
    let result;
    try {
      // Wait for rate limit before executing, then run the actual fetch
      await waitForRateLimit();
      result = await fn();
    } catch (err) {
      completedRequests++;
      logFailure(index, err);
      return;
    }

    // Count the request exactly once, even if the callback below throws
    completedRequests++;
    if (!onResultCallback) return;
    try {
      // Process result immediately
      await onResultCallback(result, completedRequests, totalRequests);
    } catch (err) {
      logFailure(index, err);
    }
  }

  // Start and maintain pool of concurrent requests
  while (requestIndex < totalRequests || activePromises.size > 0) {
    // Fill up to the pool size
    while (activePromises.size < poolSize && requestIndex < totalRequests) {
      const currentIndex = requestIndex++;
      const promise = executeRequest(asyncFunctions[currentIndex], currentIndex);
      activePromises.add(promise);
      // Remove from set when complete
      promise.finally(() => {
        activePromises.delete(promise);
      });
    }
    // Wait for at least one request to complete before continuing
    if (activePromises.size > 0) {
      await Promise.race(activePromises);
    }
  }
}
// Helper function to resolve URN references.
// Returns the first entry of `included` whose entityUrn equals `urn`, or null
// when `urn` is empty or nothing matches.
function resolveUrn(urn, included) {
  if (!urn) {
    return null;
  }
  const match = included.find(({ entityUrn }) => entityUrn === urn);
  return match ? match : null;
}
// Helper function to extract ID from URN.
// Returns the first run of decimal digits found in `urn` (as a string), or
// null when `urn` is empty or contains no digits.
function extractId(urn) {
  if (!urn) {
    return null;
  }
  const digits = urn.match(/\d+/);
  if (digits === null) {
    return null;
  }
  return digits[0];
}
// Clean invalid JSON control characters.
// Removes every character in U+0000-U+001F and U+007F-U+009F except tab,
// line feed and carriage return, which are kept as they are.
function cleanJSON(rawData) {
  // Same ranges as above with \x09 (tab), \x0A (LF) and \x0D (CR) carved out
  const strippedControls = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g;
  return rawData.replace(strippedControls, '');
}
// Flattens a job card's footer items into one object:
//   LISTED_DATE (with timeAt) -> listedAt (ISO string) + listedAtTimestamp
//   EASY_APPLY_TEXT           -> easyApply: true
//   any other item with text  -> [item.type]: item.text.text
function summarizeJobCardFooter(footerItems) {
  const footer = {};
  for (const item of footerItems) {
    if (item.type === 'LISTED_DATE' && item.timeAt) {
      // An unparseable timestamp must not throw and discard the whole page
      const listedDate = new Date(item.timeAt);
      if (!Number.isNaN(listedDate.getTime())) {
        footer.listedAt = listedDate.toISOString();
      }
      footer.listedAtTimestamp = item.timeAt;
    } else if (item.type === 'EASY_APPLY_TEXT') {
      footer.easyApply = true;
    } else if (item.text) {
      footer[item.type] = item.text.text;
    }
  }
  return footer;
}

// Parse job listing cards.
//
// Takes a parsed API response and returns one plain object per entry of
// `responseData.included` that is a JobPostingCard whose entityUrn contains
// 'JOBS_SEARCH'. Returns [] when there is no `included` array.
//
// Differences from the earlier version:
//   * referenced job postings are looked up through a Map built once per
//     response instead of a linear scan of `included` for every card;
//   * `entityUrn` falls back to the card's '*jobPosting' reference when
//     `jobPostingUrn` is absent (the lookup below already accepted either);
//   * an invalid LISTED_DATE timestamp no longer throws.
function parseJobListing(responseData) {
  const included = responseData.included || [];

  // First occurrence wins, matching a left-to-right search by entityUrn
  const entitiesByUrn = new Map();
  for (const item of included) {
    if (item?.entityUrn && !entitiesByUrn.has(item.entityUrn)) {
      entitiesByUrn.set(item.entityUrn, item);
    }
  }

  const jobs = [];
  for (const card of included) {
    const isSearchCard =
      card?.$type === 'com.linkedin.voyager.dash.jobs.JobPostingCard' &&
      card.entityUrn &&
      card.entityUrn.includes('JOBS_SEARCH');
    if (!isSearchCard) continue;

    const jobData = {
      id: extractId(card.jobPostingUrn || card.entityUrn),
      entityUrn: card.jobPostingUrn || card['*jobPosting'],
      cardEntityUrn: card.entityUrn,
      title: card.title?.text || card.jobPostingTitle || 'N/A',
      company: card.primaryDescription?.text || 'N/A',
      location: card.secondaryDescription?.text || 'N/A',
      trackingId: card.trackingId,
      referenceId: card.referenceId,
      verified: card.title?.accessibilityText?.includes('verification') || false,
      companyId: null,
      companyUrn: null,
      companyLogoAlt: null,
    };

    if (card.footerItems && card.footerItems.length > 0) {
      jobData.footer = summarizeJobCardFooter(card.footerItems);
    }

    // Company identity is carried by the first logo attribute
    const logoAttr = card.logo?.attributes?.[0];
    if (logoAttr) {
      const companyLogoUrn = logoAttr.detailDataUnion?.companyLogo || logoAttr.detailData?.['*companyLogo'];
      if (companyLogoUrn) {
        jobData.companyId = extractId(companyLogoUrn);
        jobData.companyUrn = companyLogoUrn;
      }
      if (card.logo.accessibilityText) {
        jobData.companyLogoAlt = card.logo.accessibilityText;
      }
    }

    // Pull a few extra fields from the referenced job posting entity, if present
    const jobPostingUrn = card['*jobPosting'] || card.jobPostingUrn;
    const jobPosting = jobPostingUrn ? entitiesByUrn.get(jobPostingUrn) : undefined;
    if (jobPosting) {
      jobData.details = {
        title: jobPosting.title,
        trackingUrn: jobPosting.trackingUrn,
        repostedJob: jobPosting.repostedJob,
        contentSource: jobPosting.contentSource,
        posterId: jobPosting.posterId,
      };
    }

    if (card.jobInsightsV2) {
      jobData.insights = card.jobInsightsV2;
    }
    if (card.tertiaryDescription?.text) {
      jobData.tertiaryInfo = card.tertiaryDescription.text;
    }

    jobs.push(jobData);
  }
  return jobs;
}
// Fetch jobs for a specific offset and filter.
//
// Parameters:
//   start  - result offset passed as the `start` query parameter
//   geoId  - location id placed in the query's locationUnion
//   filter - one FILTER_COMBINATIONS-style object
//
// Returns the parsed JSON response, or null if curl fails or the body is not
// valid JSON (the error is logged).
//
// curl is invoked through execFile with an argument array rather than through
// a shell command string, so quotes or other shell metacharacters in the
// cookies, CSRF token or geo id can neither break nor inject into the command.
// The malformed `sec-ch-ua` header value (`v=24"`) is also corrected.
async function fetchJobsAtOffset(start, geoId, filter) {
  const { workplace, experience, jobType, verified, applyWithLinkedin } = filter;

  // Build filter string
  let filterStr = '';
  if (workplace) filterStr += `,workplaceType:List(${workplace})`;
  if (experience) filterStr += `,experience:List(${experience})`;
  if (jobType) filterStr += `,jobType:List(${jobType})`;
  // These two already hold complete ",name:List(...)" fragments (or are empty)
  filterStr += verified || '';
  filterStr += applyWithLinkedin || '';

  const url = `${CONFIG.baseUrl}?decorationId=${CONFIG.decorationId}&count=${CONFIG.count}&q=jobSearch&query=(origin:JOB_SEARCH_PAGE_JOB_FILTER,locationUnion:(geoId:${geoId}),selectedFilters:(timePostedRange:List(r2592000)${filterStr}),spellCorrectionEnabled:true)&start=${start}`;

  const headers = [
    'accept: application/vnd.linkedin.normalized+json+2.1',
    'accept-language: en-US,en-GB;q=0.9,en;q=0.8',
    'cache-control: no-cache',
    `csrf-token: ${CONFIG.csrfToken}`,
    'dnt: 1',
    'pragma: no-cache',
    'priority: u=1, i',
    'referer: https://www.linkedin.com/jobs/search/?currentJobId=4329808219&f_TPR=r2592000&geoId=91000000&origin=JOB_SEARCH_PAGE_JOB_FILTER',
    'sec-ch-prefers-color-scheme: dark',
    'sec-ch-ua: "Google Chrome";v="143", "Chromium";v="143", "Not A(Brand";v="24"',
    'sec-ch-ua-mobile: ?0',
    'sec-ch-ua-platform: "macOS"',
    'sec-fetch-dest: empty',
    'sec-fetch-mode: cors',
    'sec-fetch-site: same-origin',
    'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
    'x-li-lang: en_US',
    'x-li-page-instance: urn:li:page:d_flagship3_search_srp_jobs;PSIhsQSvS/Omd2tT0L8VFw==',
    'x-li-pem-metadata: Voyager - Careers - Jobs Search=jobs-search-results,Voyager - Careers - Critical - careers-api=jobs-search-results',
    'x-li-prefetch: 1',
    'x-li-track: {"clientVersion":"1.13.41695","mpVersion":"1.13.41695","osName":"web","timezoneOffset":1,"timezone":"Africa/Tunis","deviceFormFactor":"DESKTOP","mpName":"voyager-web","displayDensity":2,"displayWidth":2940,"displayHeight":1912}',
    'x-restli-protocol-version: 2.0.0',
  ];

  const curlArgs = ['-s', url, '-b', CONFIG.cookies];
  for (const header of headers) {
    curlArgs.push('-H', header);
  }

  try {
    const { stdout: rawResponse } = await execFileAsync('curl', curlArgs, {
      encoding: 'utf8',
      maxBuffer: 10 * 1024 * 1024,
    });
    return JSON.parse(cleanJSON(rawResponse));
  } catch (error) {
    console.error(` ✗ Error fetching jobs at offset ${start}:`, error.message);
    return null;
  }
}
// Insert location into database (ignored if the row already exists).
//
// `location` supplies geoId, geoUrn, name, countryCode and searchKeyword.
// Resolves with no value; rejects with the database error.
//
// db.run() is used instead of prepare()/run()/finalize(): with a callback-less
// prepare(), a statement that fails to compile (for example because the table
// is missing) is not reported to the run() callback, so the promise would
// never settle. db.run() reports both prepare and execution errors here.
function insertLocation(location) {
  return new Promise((resolve, reject) => {
    db.run(
      `
      INSERT OR IGNORE INTO locations (geo_id, geo_urn, name, country_code, search_keyword)
      VALUES (?, ?, ?, ?, ?)
      `,
      [location.geoId, location.geoUrn, location.name, location.countryCode, location.searchKeyword],
      (err) => {
        if (err) reject(err);
        else resolve();
      }
    );
  });
}
// Insert company into database, or refresh it if it is already there.
//
// Does nothing when the job carries no companyId. Resolves with no value;
// rejects with the database error.
//
// The earlier SELECT-then-INSERT/UPDATE sequence could interleave between
// concurrently processed responses: two callers both saw "no row" and both
// inserted, and the loser's error made the caller skip the job entirely.
// Now the row is updated first and, only if nothing was updated, inserted with
// a NOT EXISTS guard inside the same statement, so a concurrent insert of the
// same company is a harmless no-op. db.run() also reports statement
// compilation errors to the callback, which callback-less prepare() did not.
async function insertCompany(job) {
  if (!job.companyId) {
    return;
  }

  // Promise wrapper around db.run(); resolves with the number of changed rows
  const run = (sql, params) =>
    new Promise((resolve, reject) => {
      db.run(sql, params, function onDone(err) {
        if (err) reject(err);
        else resolve(this.changes);
      });
    });

  // Company exists: refresh name, logo text and the updated_at timestamp
  const updatedRows = await run(
    `
    UPDATE companies
    SET company_name = ?, company_logo_alt = ?, updated_at = CURRENT_TIMESTAMP
    WHERE company_id = ?
    `,
    [job.company, job.companyLogoAlt, job.companyId]
  );
  if (updatedRows > 0) {
    return;
  }

  // Insert new company (unless someone else inserted it in the meantime)
  await run(
    `
    INSERT INTO companies (company_id, company_urn, company_name, company_logo_alt)
    SELECT ?, ?, ?, ?
    WHERE NOT EXISTS (SELECT 1 FROM companies WHERE company_id = ?)
    `,
    [job.companyId, job.companyUrn, job.company, job.companyLogoAlt, job.companyId]
  );
}
// Insert job into database (or refresh it), then record which location and
// which scraping run/filter it was found under.
//
// Parameters:
//   job    - object produced by parseJobListing
//   geoId  - location the job was found in
//   runId  - id of the scraping_runs row for the filter that found it
//   filter - the filter combination that found it
//
// Resolves with no value; rejects with the first database error.
//
// The same job is routinely returned by several filters whose responses are
// processed concurrently. The earlier "SELECT, then INSERT if missing" steps
// could therefore interleave, and the losing INSERT failed, which dropped that
// job's location/filter mappings. Each existence check is now part of the
// writing statement itself (UPDATE first; INSERT ... WHERE NOT EXISTS), so a
// concurrent duplicate is a no-op instead of an error. db.run() is used so
// that statement compilation errors reach the callback as well.
async function insertJob(job, geoId, runId, filter) {
  // Promise wrapper around db.run(); resolves with the number of changed rows
  const run = (sql, params) =>
    new Promise((resolve, reject) => {
      db.run(sql, params, function onDone(err) {
        if (err) reject(err);
        else resolve(this.changes);
      });
    });

  // Column values shared by UPDATE and INSERT, in column order
  const jobValues = [
    job.entityUrn,
    job.cardEntityUrn,
    job.title,
    job.company,
    job.companyId || null,
    job.location,
    job.trackingId,
    job.referenceId,
    job.verified ? 1 : 0,
    job.footer?.easyApply ? 1 : 0,
    job.footer?.listedAt || null,
    job.footer?.listedAtTimestamp || null,
    job.details?.contentSource || null,
    job.details?.posterId || null,
    job.details?.repostedJob ? 1 : 0,
    job.companyLogoAlt || null,
    job.tertiaryInfo || null,
  ];

  // Refresh the job if it is already stored ...
  const updatedRows = await run(
    `
    UPDATE jobs SET
      entity_urn = ?, card_entity_urn = ?, title = ?, company = ?, company_id = ?,
      location = ?, tracking_id = ?, reference_id = ?, verified = ?, easy_apply = ?,
      listed_at = ?, listed_at_timestamp = ?, content_source = ?, poster_id = ?,
      reposted_job = ?, company_logo_alt = ?, tertiary_info = ?, updated_at = CURRENT_TIMESTAMP
    WHERE id = ?
    `,
    [...jobValues, job.id]
  );

  // ... otherwise insert it, unless a concurrent caller just did
  if (updatedRows === 0) {
    await run(
      `
      INSERT INTO jobs (
        id, entity_urn, card_entity_urn, title, company, company_id,
        location, tracking_id, reference_id, verified, easy_apply,
        listed_at, listed_at_timestamp, content_source, poster_id,
        reposted_job, company_logo_alt, tertiary_info
      )
      SELECT ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
      WHERE NOT EXISTS (SELECT 1 FROM jobs WHERE id = ?)
      `,
      [job.id, ...jobValues, job.id]
    );
  }

  // Job-location mapping, only if it doesn't exist yet
  await run(
    `
    INSERT INTO job_locations (job_id, geo_id)
    SELECT ?, ?
    WHERE NOT EXISTS (SELECT 1 FROM job_locations WHERE job_id = ? AND geo_id = ?)
    `,
    [job.id, geoId, job.id, geoId]
  );

  // Job-filter mapping, at most one per (job, scraping run)
  await run(
    `
    INSERT INTO job_filters (
      job_id, scraping_run_id, geo_id,
      workplace_type, experience_level, job_type,
      verified_only, apply_with_linkedin, filter_description
    )
    SELECT ?, ?, ?, ?, ?, ?, ?, ?, ?
    WHERE NOT EXISTS (SELECT 1 FROM job_filters WHERE job_id = ? AND scraping_run_id = ?)
    `,
    [
      job.id,
      runId,
      geoId,
      filter.workplace || null,
      filter.experience || null,
      filter.jobType || null,
      filter.verified ? 1 : 0,
      filter.applyWithLinkedin ? 1 : 0,
      filter.description,
      job.id,
      runId,
    ]
  );
}
// Create scraping run record.
//
// Inserts a scraping_runs row with status 'running' for the given location and
// filter, and resolves with the id of the new row. Rejects with the database
// error.
//
// db.run() replaces prepare()/run()/finalize() so that a statement which fails
// to compile is reported to the callback instead of leaving the promise
// pending.
function createScrapingRun(geoId, filter) {
  return new Promise((resolve, reject) => {
    db.run(
      `
      INSERT INTO scraping_runs (
        geo_id, workplace_type, experience_level, job_type,
        verified_only, apply_with_linkedin, started_at, status
      ) VALUES (?, ?, ?, ?, ?, ?, datetime('now'), 'running')
      `,
      [
        geoId,
        filter.workplace || null,
        filter.experience || null,
        filter.jobType || null,
        filter.verified ? 1 : 0,
        filter.applyWithLinkedin ? 1 : 0,
      ],
      // Regular function: the sqlite3 callback context carries lastID
      function onInserted(err) {
        if (err) reject(err);
        else resolve(this.lastID);
      }
    );
  });
}
// Update scraping run record.
//
// Stores the final counters and status for the run and stamps completed_at.
// `stats` supplies totalJobs, successfulRequests, failedRequests and status.
// Resolves with no value; rejects with the database error.
//
// db.run() replaces prepare()/run()/finalize() so that a statement which fails
// to compile is reported to the callback instead of leaving the promise
// pending.
function updateScrapingRun(runId, stats) {
  return new Promise((resolve, reject) => {
    db.run(
      `
      UPDATE scraping_runs SET
        total_jobs_fetched = ?,
        successful_requests = ?,
        failed_requests = ?,
        completed_at = datetime('now'),
        status = ?
      WHERE id = ?
      `,
      [stats.totalJobs, stats.successfulRequests, stats.failedRequests, stats.status, runId],
      (err) => {
        if (err) reject(err);
        else resolve();
      }
    );
  });
}
// Build the deferred page requests for one filter combination.
//
// Nothing is fetched here. The result is an array with one entry per page
// (CONFIG.maxResultsPerFilter / CONFIG.count pages, rounded up); each entry
// has `filterDescription` and an `fn` that, when called, fetches that page and
// resolves to { offset, index, filter, runId, geoId, data }.
async function fetchJobsForFilter(geoId, filter, runId) {
  const pageCount = Math.ceil(CONFIG.maxResultsPerFilter / CONFIG.count);

  return Array.from({ length: pageCount }, (_unused, pageIndex) => {
    const offset = pageIndex * CONFIG.count;

    const fetchPage = async () => ({
      offset,
      index: pageIndex,
      filter,
      runId,
      geoId,
      data: await fetchJobsAtOffset(offset, geoId, filter),
    });

    return { fn: fetchPage, filterDescription: filter.description };
  });
}
// Main function.
//
// Opens the database, loads the location list from CONFIG.geoIdsFile and, for
// every location from the start index onwards, runs all FILTER_COMBINATIONS
// against the API, saves the parsed jobs as responses arrive and records one
// scraping_runs row per filter. The optional first command-line argument is
// the index of the location to resume from.
//
// Changes from the earlier version:
//   * the start index is parsed with an explicit radix and clamped to the
//     valid range (a negative value used to crash on locations[-1]);
//   * a missing/invalid "locations" array fails with a clear message;
//   * only per-filter counters are kept instead of every response payload,
//     which previously stayed in memory until the location finished;
//   * errors while saving a job are counted and the first few are logged
//     instead of being silently discarded;
//   * the requests/minute figure is computed from numbers and no longer shows
//     "Infinity" for the first few milliseconds;
//   * the final total is labelled as what it is: job cards summed over all
//     filters, duplicates included.
async function fetchAllJobs() {
  // Cap on individually logged save errors per location, to keep output readable
  const MAX_LOGGED_SAVE_ERRORS = 5;

  console.log('='.repeat(100));
  console.log('LinkedIn Jobs Scraper - Database Version');
  console.log('='.repeat(100));
  console.log();

  // Initialize database
  await initDatabase();

  // Load geo IDs
  const geoData = JSON.parse(fs.readFileSync(CONFIG.geoIdsFile, 'utf8'));
  const locations = geoData.locations;
  if (!Array.isArray(locations)) {
    throw new Error(`Expected a "locations" array in ${CONFIG.geoIdsFile}`);
  }
  console.log(`Loaded ${locations.length} locations from ${CONFIG.geoIdsFile}`);
  console.log(`Will use ${FILTER_COMBINATIONS.length} filter combinations per location`);
  console.log();

  // Parse command-line argument for start index (missing or non-numeric means 0)
  const requestedStart = Number.parseInt(process.argv[2], 10);
  const startIndex = Number.isNaN(requestedStart)
    ? 0
    : Math.min(Math.max(requestedStart, 0), locations.length);

  let totalJobsCollected = 0;
  if (startIndex > 0) {
    console.log(`⚠️ Resuming from index ${startIndex}`);
    console.log();
  }

  const totalLocations = locations.length - startIndex;
  console.log(`Processing ${totalLocations} locations (${startIndex} → ${locations.length - 1})`);
  console.log();

  let totalLocationsProcessed = 0;

  // Process each location starting from resume index
  for (let i = startIndex; i < locations.length; i++) {
    const location = locations[i];
    totalLocationsProcessed++;

    console.log('='.repeat(100));
    console.log(`[${i}/${locations.length - 1}] ${location.name} (${location.countryCode || 'N/A'})`);
    console.log(`Geo ID: ${location.geoId} | Progress: ${totalLocationsProcessed}/${totalLocations} locations`);
    console.log('='.repeat(100));
    console.log();

    // Insert location into database
    await insertLocation(location);

    // Create scraping runs for all filters
    console.log(`Creating scraping runs for ${FILTER_COMBINATIONS.length} filters...`);
    const filterRunIds = [];
    for (const filter of FILTER_COMBINATIONS) {
      const runId = await createScrapingRun(location.geoId, filter);
      filterRunIds.push({ filter, runId });
    }

    // Collect ALL API calls across ALL filters
    console.log(`Preparing API calls for all filters...`);
    const allFetchFunctions = [];
    for (const { filter, runId } of filterRunIds) {
      const filterFetchFunctions = await fetchJobsForFilter(location.geoId, filter, runId);
      allFetchFunctions.push(...filterFetchFunctions.map((f) => ({ ...f, runId })));
    }

    const totalApiCalls = allFetchFunctions.length;
    console.log(`Executing ${totalApiCalls} API calls across ${FILTER_COMBINATIONS.length} filters with streaming processing (220 concurrent requests, max ${CONFIG.maxCallsPerMinute}/min)...`);
    console.log();

    // Per-filter counters, keyed by filter description
    const filterStats = {};
    for (const { filter } of filterRunIds) {
      filterStats[filter.description] = { completed: 0, total: 0, jobs: 0, successful: 0, failed: 0 };
    }
    // Count total requests per filter
    for (const f of allFetchFunctions) {
      filterStats[f.filterDescription].total++;
    }

    let totalJobsProcessed = 0;
    let saveFailures = 0;
    const startTime = Date.now();

    // Execute with streaming processing - results are processed as they arrive
    await executeWithStreamingRateLimit(
      allFetchFunctions.map((f) => f.fn),
      async (result, completed, total) => {
        const stats = filterStats[result.filter.description];
        stats.completed++;
        // A request counts as successful when it produced a parsed response
        if (result.data) stats.successful++;
        else stats.failed++;

        // Process this individual result immediately
        if (result.data) {
          const jobs = parseJobListing(result.data);
          // Save to database immediately
          for (const job of jobs) {
            try {
              await insertCompany(job);
              await insertJob(job, result.geoId, result.runId, result.filter);
              totalJobsProcessed++;
            } catch (err) {
              // Keep going, but do not hide the problem
              saveFailures++;
              if (saveFailures <= MAX_LOGGED_SAVE_ERRORS) {
                console.error(`\n ✗ Failed to save job ${job.id}: ${err?.message ?? err}`);
              }
            }
          }
          stats.jobs += jobs.length;
        }

        // Show progress every 10 completed requests
        if (completed % 10 === 0 || completed === total) {
          const elapsedSeconds = (Date.now() - startTime) / 1000;
          const rate = elapsedSeconds > 0 ? (completed / (elapsedSeconds / 60)).toFixed(1) : '0.0';
          process.stdout.write(`\r Progress: ${completed}/${total} requests (${rate}/min) | ${totalJobsProcessed} jobs saved`);
        }
      }
    );

    const duration = ((Date.now() - startTime) / 1000).toFixed(1);
    console.log();
    console.log();
    console.log(`✓ All API calls completed in ${duration}s`);
    if (saveFailures > 0) {
      console.log(` ✗ ${saveFailures} jobs could not be saved for this location`);
    }
    console.log();

    // Update scraping runs and show summary
    console.log('Filter Summary:');
    let locationJobsCount = 0;
    for (const { filter, runId } of filterRunIds) {
      const stats = filterStats[filter.description];
      const { successful, failed } = stats;

      // A run with no successful request at all is recorded as failed
      await updateScrapingRun(runId, {
        totalJobs: stats.jobs,
        successfulRequests: successful,
        failedRequests: failed,
        status: successful === 0 ? 'failed' : 'completed',
      });

      locationJobsCount += stats.jobs;
      console.log(` ${filter.description.padEnd(40)} → ${stats.jobs} jobs (${successful}/${stats.total} successful)`);
    }

    totalJobsCollected += locationJobsCount;
    console.log();
    console.log(` ✓ Location complete: ${locationJobsCount} jobs collected`);
    console.log(` Running total: ${totalJobsCollected} jobs across ${totalLocationsProcessed} locations`);

    // Delay between locations
    if (i < locations.length - 1) {
      console.log(`\n Waiting ${CONFIG.delayBetweenGeoIds}ms before next location...`);
      await sleep(CONFIG.delayBetweenGeoIds);
    }
  }

  console.log();
  console.log('='.repeat(100));
  console.log('SCRAPING COMPLETE');
  console.log('='.repeat(100));
  console.log(`Total job cards collected (duplicates across filters included): ${totalJobsCollected}`);
  console.log(`Total locations processed: ${totalLocationsProcessed}`);
  console.log(`Database: ${CONFIG.dbPath}`);
  console.log();
  console.log('All done! 🎉');

  // Close database
  db.close();
}
- // Graceful shutdown handler
- // process.on('SIGINT', () => {
- // console.log('\n\n⚠️ Interrupt received!');
- // console.log(' To resume, run: node fetch-all-jobs-db.js <last_completed_index + 1>');
- // if (db) db.close();
- // process.exit(0);
- // });
// Run the scraper.
// Any error escaping fetchAllJobs is reported, the database (if one was
// opened) is closed, and the process exits with status 1.
function handleFatalError(error) {
  console.error('Fatal error:', error);
  if (db) {
    db.close();
  }
  process.exit(1);
}

fetchAllJobs().catch(handleFatalError);
Advertisement
Add Comment
Please, Sign In to add comment