Guest User

Untitled

a guest
Jan 15th, 2026
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 33.45 KB | None | 0 0
  1. const fs = require('fs');
  2. const path = require('path');
  3. const { exec } = require('child_process');
  4. const { promisify } = require('util');
  5. const execAsync = promisify(exec);
  6. const sqlite3 = require('sqlite3').verbose();
  7.  
  /**
   * Scraper configuration.
   *
   * Credentials are taken from the LINKEDIN_COOKIES / LINKEDIN_CSRF_TOKEN
   * environment variables when set, so secrets do not have to be pasted into
   * the source file. The original placeholders remain as the fallback, which
   * keeps "edit the file by hand" working exactly as before.
   */
  const CONFIG = {
    baseUrl: 'https://www.linkedin.com/voyager/api/voyagerJobsDashJobCards',
    decorationId: 'com.linkedin.voyager.dash.deco.jobs.search.JobSearchCardsCollection-220',
    count: 100, // Number of results per request
    maxResultsPerFilter: 1000, // LinkedIn's limit per filter combination
    maxCallsPerMinute: 350, // Rate limit: 350 API calls per minute
    // NOTE(review): delayBetweenFilters is not read anywhere in the visible code.
    delayBetweenFilters: 5000, // 5 seconds delay between filter changes
    delayBetweenGeoIds: 10000, // 10 seconds delay between location changes
    dbPath: path.join(__dirname, 'data', 'linkedin_jobs-v3.db'),
    geoIdsFile: path.join(__dirname, 'european-geo-ids.json'),
    cookies: process.env.LINKEDIN_COOKIES || '<your_linkedIn_cookies_here>',
    csrfToken: process.env.LINKEDIN_CSRF_TOKEN || '<your_linkedIn_csrf_token_here>'
  };

  // Rate limiter state shared by every request: number of calls made in the
  // current one-minute window and the timestamp at which that window started.
  const rateLimiter = {
    callsThisMinute: 0,
    minuteStartTime: Date.now(),
    queue: []
  };
  29.  
  // Filter combinations to maximize data collection.
  // Each entry narrows the search differently so that rotating through them
  // surfaces different sets of jobs. Every entry has the same six keys; unused
  // filters are empty strings.
  const FILTER_COMBINATIONS = (() => {
    const unfiltered = { workplace: '', experience: '', jobType: '', verified: '', applyWithLinkedin: '' };
    const combo = (overrides, description) => ({ ...unfiltered, ...overrides, description });

    return [
      // Workplace type variations (1:On-site, 2:Remote, 3:Hybrid)
      combo({ workplace: '1' }, 'On-site jobs'),
      combo({ workplace: '2' }, 'Remote jobs'),
      combo({ workplace: '3' }, 'Hybrid jobs'),

      // Experience level variations (1:Internship, 2:Entry level, 3:Associate, 4:Mid-Senior, 5:Director, 6:Executive)
      combo({ experience: '1' }, 'Internship'),
      combo({ experience: '2' }, 'Entry level'),
      combo({ experience: '3' }, 'Associate'),
      combo({ experience: '4' }, 'Mid-Senior level'),
      combo({ experience: '5' }, 'Director'),
      combo({ experience: '6' }, 'Executive'),

      // Job type variations (F:Full-time, P:Part-time, C:Contract, T:Temporary, I:Internship, V:Volunteer, O:Other)
      combo({ jobType: 'F' }, 'Full-time'),
      combo({ jobType: 'P' }, 'Part-time'),
      combo({ jobType: 'C' }, 'Contract'),
      combo({ jobType: 'T' }, 'Temporary'),
      combo({ jobType: 'I' }, 'Internship jobs'),

      // Special filters (pre-formatted query fragments, appended verbatim)
      combo({ verified: ',verifiedJob:List(true)' }, 'Verified jobs only'),
      combo({ applyWithLinkedin: ',applyWithLinkedin:List(true)' }, 'Easy Apply jobs'),

      // Combined filters for more granular results
      combo({ workplace: '2', experience: '2', jobType: 'F' }, 'Remote Entry-level Full-time'),
      combo({ workplace: '2', experience: '3', jobType: 'F' }, 'Remote Associate Full-time'),
      combo({ workplace: '2', experience: '4', jobType: 'F' }, 'Remote Mid-Senior Full-time'),
      combo({ workplace: '1', experience: '2', jobType: 'F' }, 'On-site Entry-level Full-time'),
      combo({ workplace: '1', experience: '3', jobType: 'F' }, 'On-site Associate Full-time'),
      combo({ workplace: '1', experience: '4', jobType: 'F' }, 'On-site Mid-Senior Full-time'),
    ];
  })();
  65.  
  // Database instance; assigned by initDatabase() and shared by the insert helpers.
  let db;

  /**
   * Open (creating if needed) the SQLite database at CONFIG.dbPath and, when a
   * schema.sql file sits next to this script, execute it.
   *
   * @returns {Promise<void>} Resolves once the database is open and the schema
   *   (if present) has been applied; rejects with the open, read, or exec error.
   */
  function initDatabase() {
    return new Promise((resolve, reject) => {
      // Create data directory if it doesn't exist
      const dataDir = path.dirname(CONFIG.dbPath);
      if (!fs.existsSync(dataDir)) {
        fs.mkdirSync(dataDir, { recursive: true });
      }

      db = new sqlite3.Database(CONFIG.dbPath, (openErr) => {
        if (openErr) {
          reject(openErr);
          return;
        }
        console.log('Connected to SQLite database');

        // This callback runs outside the Promise executor, so a synchronous
        // throw here (e.g. an unreadable schema file) would escape as an
        // uncaught exception and leave the promise pending. Route it to reject().
        try {
          const schemaPath = path.join(__dirname, 'schema.sql');
          if (!fs.existsSync(schemaPath)) {
            resolve();
            return;
          }

          const schema = fs.readFileSync(schemaPath, 'utf8');
          db.exec(schema, (schemaErr) => {
            if (schemaErr) {
              reject(schemaErr);
              return;
            }
            console.log('Database schema initialized');
            resolve();
          });
        } catch (err) {
          reject(err);
        }
      });
    });
  }
  102.  
  /**
   * Pause for a number of milliseconds.
   *
   * @param {number} ms - Delay handed to setTimeout.
   * @returns {Promise<void>} Resolves (with no value) once the timer fires.
   */
  function sleep(ms) {
    return new Promise((wake) => {
      setTimeout(wake, ms);
    });
  }
  107.  
  /**
   * Rate limiter - ensures we don't exceed CONFIG.maxCallsPerMinute.
   *
   * Reserves one call slot in the current one-minute window, sleeping until the
   * next window when the current one is full. Every call that returns has been
   * counted, including the first call of a fresh window.
   *
   * After sleeping, the state is re-evaluated from scratch instead of being
   * reset unconditionally: many callers may be sleeping at once, and only the
   * first one to wake should start the new window; the others must be counted
   * against it.
   *
   * @returns {Promise<void>} Resolves once a slot has been reserved.
   */
  async function waitForRateLimit() {
    const WINDOW_MS = 60000;

    for (;;) {
      const now = Date.now();

      // Start a new window if a minute has passed
      if (now - rateLimiter.minuteStartTime >= WINDOW_MS) {
        rateLimiter.callsThisMinute = 0;
        rateLimiter.minuteStartTime = now;
      }

      if (rateLimiter.callsThisMinute < CONFIG.maxCallsPerMinute) {
        rateLimiter.callsThisMinute++;
        return;
      }

      // Window is full: wait until it ends, then loop to re-check.
      const timeToWait = WINDOW_MS - (now - rateLimiter.minuteStartTime);
      console.log(` ⏱ Rate limit reached (${CONFIG.maxCallsPerMinute} calls/min). Waiting ${Math.ceil(timeToWait / 1000)}s...`);
      await sleep(timeToWait);
    }
  }
  131.  
  /**
   * Execute requests continuously with streaming processing (with rate limit).
   *
   * Keeps up to `maxConcurrent` requests in flight; each one first reserves a
   * slot via waitForRateLimit(), then runs, then hands its result to
   * `onResultCallback` as soon as it is available. A failing request or a
   * failing callback is logged and does not stop the remaining requests.
   *
   * @param {Array<() => Promise<*>>} asyncFunctions - Request thunks to run.
   * @param {?function(*, number, number): (void|Promise<void>)} onResultCallback -
   *   Called with (result, completedSoFar, totalRequests) for each successful request.
   * @param {{maxConcurrent?: number}} [options] - `maxConcurrent` defaults to 220.
   * @returns {Promise<void>} Resolves when every request has settled.
   */
  async function executeWithStreamingRateLimit(asyncFunctions, onResultCallback, options = {}) {
    const { maxConcurrent = 220 } = options;
    const poolSize = Math.max(1, maxConcurrent); // a pool of 0 would spin forever
    const totalRequests = asyncFunctions.length;
    let completedRequests = 0;
    let nextIndex = 0;

    const inFlight = new Set();

    // Rejections are not guaranteed to be Error objects.
    const describe = (err) => (err && err.message) || String(err);

    // Runs one request end to end; never rejects.
    async function executeRequest(fn, index) {
      let result;
      try {
        await waitForRateLimit();
        result = await fn();
      } catch (err) {
        completedRequests++;
        console.error(` ✗ Request ${index + 1} failed:`, describe(err));
        return;
      }

      // Count the request exactly once, before the callback runs, so a
      // throwing callback cannot bump the counter a second time.
      completedRequests++;
      if (!onResultCallback) {
        return;
      }
      try {
        await onResultCallback(result, completedRequests, totalRequests);
      } catch (err) {
        console.error(` ✗ Request ${index + 1} failed:`, describe(err));
      }
    }

    // Start and maintain pool of concurrent requests
    while (nextIndex < totalRequests || inFlight.size > 0) {
      while (inFlight.size < poolSize && nextIndex < totalRequests) {
        const index = nextIndex++;
        // Track the post-cleanup promise so that by the time the race below
        // resumes, the finished request has already left the pool.
        const tracked = executeRequest(asyncFunctions[index], index).finally(() => {
          inFlight.delete(tracked);
        });
        inFlight.add(tracked);
      }

      // Wait for at least one request to complete before refilling
      if (inFlight.size > 0) {
        await Promise.race(inFlight);
      }
    }
  }
  182.  
  /**
   * Look up the entity whose `entityUrn` equals the given URN.
   *
   * @param {?string} urn - URN to resolve; a falsy value yields null.
   * @param {Array<Object>} included - Entities to search, in order.
   * @returns {?Object} The first matching entity, or null when none matches.
   */
  function resolveUrn(urn, included) {
    if (!urn) {
      return null;
    }
    for (const candidate of included) {
      if (candidate.entityUrn === urn) {
        return candidate;
      }
    }
    return null;
  }
  189.  
  /**
   * Pull the first run of decimal digits out of a URN.
   *
   * @param {?string} urn - URN string; a falsy value yields null.
   * @returns {?string} The digits as a string, or null when the URN has none.
   */
  function extractId(urn) {
    if (!urn) {
      return null;
    }
    const [, digits] = urn.match(/(\d+)/) ?? [];
    return digits ?? null;
  }
  196.  
  /**
   * Strip control characters from raw response text before JSON parsing.
   *
   * Removes code points U+0000-U+001F and U+007F-U+009F, except tab, line feed
   * and carriage return, which are kept as-is.
   *
   * @param {string} rawData - Raw response text.
   * @returns {string} The text with the offending characters removed.
   */
  function cleanJSON(rawData) {
    // Same ranges as above with \t (09), \n (0A) and \r (0D) carved out.
    const strippable = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g;
    return rawData.replace(strippable, '');
  }
  206.  
  /**
   * Parse job listing cards out of a normalized Voyager search response.
   *
   * Only `included` entities of type JobPostingCard whose entityUrn contains
   * 'JOBS_SEARCH' are turned into jobs. Optional sections (`footer`, `details`,
   * `insights`, `tertiaryInfo`) are only present on a job when the card
   * carries the corresponding data.
   *
   * @param {{included?: Array<Object>}} responseData - Parsed API response.
   * @returns {Array<Object>} One plain job object per matching card.
   */
  function parseJobListing(responseData) {
    const included = responseData.included || [];

    // Index entities by URN once instead of scanning `included` for every
    // card. First occurrence wins, matching a front-to-back linear search.
    const entityByUrn = new Map();
    for (const item of included) {
      if (item.entityUrn && !entityByUrn.has(item.entityUrn)) {
        entityByUrn.set(item.entityUrn, item);
      }
    }

    const jobCards = included.filter(item =>
      item.$type === 'com.linkedin.voyager.dash.jobs.JobPostingCard' &&
      item.entityUrn && item.entityUrn.includes('JOBS_SEARCH')
    );

    return jobCards.map(card => {
      const jobData = {
        id: extractId(card.jobPostingUrn || card.entityUrn),
        entityUrn: card.jobPostingUrn,
        cardEntityUrn: card.entityUrn,
        title: card.title?.text || card.jobPostingTitle || 'N/A',
        company: card.primaryDescription?.text || 'N/A',
        location: card.secondaryDescription?.text || 'N/A',
        trackingId: card.trackingId,
        referenceId: card.referenceId,
        verified: card.title?.accessibilityText?.includes('verification') || false,
        companyId: null,
        companyUrn: null,
        companyLogoAlt: null
      };

      if (card.footerItems && card.footerItems.length > 0) {
        const footerData = {};
        for (const item of card.footerItems) {
          if (item.type === 'LISTED_DATE' && item.timeAt) {
            // toISOString() throws on an invalid date; one bad timestamp
            // must not discard every job in the response.
            const listedDate = new Date(item.timeAt);
            if (!Number.isNaN(listedDate.getTime())) {
              footerData.listedAt = listedDate.toISOString();
              footerData.listedAtTimestamp = item.timeAt;
            }
          } else if (item.type === 'EASY_APPLY_TEXT') {
            footerData.easyApply = true;
          } else if (item.text) {
            footerData[item.type] = item.text.text;
          }
        }
        jobData.footer = footerData;
      }

      if (card.logo?.attributes?.[0]) {
        const logoAttr = card.logo.attributes[0];
        const companyLogoUrn = logoAttr.detailDataUnion?.companyLogo || logoAttr.detailData?.['*companyLogo'];
        if (companyLogoUrn) {
          jobData.companyId = extractId(companyLogoUrn);
          jobData.companyUrn = companyLogoUrn;
        }
        if (card.logo.accessibilityText) {
          jobData.companyLogoAlt = card.logo.accessibilityText;
        }
      }

      const jobPostingUrn = card['*jobPosting'] || card.jobPostingUrn;
      const jobPosting = jobPostingUrn ? entityByUrn.get(jobPostingUrn) : undefined;
      if (jobPosting) {
        jobData.details = {
          title: jobPosting.title,
          trackingUrn: jobPosting.trackingUrn,
          repostedJob: jobPosting.repostedJob,
          contentSource: jobPosting.contentSource,
          posterId: jobPosting.posterId
        };
      }

      if (card.jobInsightsV2) {
        jobData.insights = card.jobInsightsV2;
      }

      if (card.tertiaryDescription?.text) {
        jobData.tertiaryInfo = card.tertiaryDescription.text;
      }

      return jobData;
    });
  }
  287.  
  /**
   * Fetch one page of job search results for a location and filter by
   * shelling out to curl.
   *
   * @param {number} start - Result offset of the page.
   * @param {string|number} geoId - LinkedIn geo id placed in the query.
   * @param {{workplace: string, experience: string, jobType: string,
   *          verified: string, applyWithLinkedin: string}} filter - Filter
   *   values; `verified` and `applyWithLinkedin` are pre-formatted query
   *   fragments appended verbatim.
   * @returns {Promise<?Object>} The parsed JSON response, or null when the
   *   request or the parsing failed (the error is logged).
   */
  async function fetchJobsAtOffset(start, geoId, filter) {
    const { workplace, experience, jobType, verified, applyWithLinkedin } = filter;

    // Quote a value for a POSIX shell: wrap in single quotes and spell every
    // embedded single quote as '\'' so cookies/tokens cannot break out of the
    // argument (or inject commands).
    const shellQuote = (value) => "'" + String(value).replace(/'/g, "'\\''") + "'";

    // Build filter string
    let filterStr = '';
    if (workplace) filterStr += `,workplaceType:List(${workplace})`;
    if (experience) filterStr += `,experience:List(${experience})`;
    if (jobType) filterStr += `,jobType:List(${jobType})`;
    filterStr += verified;
    filterStr += applyWithLinkedin;

    const url = `${CONFIG.baseUrl}?decorationId=${CONFIG.decorationId}&count=${CONFIG.count}&q=jobSearch&query=(origin:JOB_SEARCH_PAGE_JOB_FILTER,locationUnion:(geoId:${geoId}),selectedFilters:(timePostedRange:List(r2592000)${filterStr}),spellCorrectionEnabled:true)&start=${start}`;

    const header = (line) => `-H ${shellQuote(line)}`;
    const curlCommand = [
      'curl -s',
      shellQuote(url),
      header('accept: application/vnd.linkedin.normalized+json+2.1'),
      header('accept-language: en-US,en-GB;q=0.9,en;q=0.8'),
      header('cache-control: no-cache'),
      `-b ${shellQuote(CONFIG.cookies)}`,
      header(`csrf-token: ${CONFIG.csrfToken}`),
      header('dnt: 1'),
      header('pragma: no-cache'),
      header('priority: u=1, i'),
      header('referer: https://www.linkedin.com/jobs/search/?currentJobId=4329808219&f_TPR=r2592000&geoId=91000000&origin=JOB_SEARCH_PAGE_JOB_FILTER'),
      header('sec-ch-prefers-color-scheme: dark'),
      // The last brand version was written as v=24" (unbalanced quote); fixed to v="24".
      header('sec-ch-ua: "Google Chrome";v="143", "Chromium";v="143", "Not A(Brand";v="24"'),
      header('sec-ch-ua-mobile: ?0'),
      header('sec-ch-ua-platform: "macOS"'),
      header('sec-fetch-dest: empty'),
      header('sec-fetch-mode: cors'),
      header('sec-fetch-site: same-origin'),
      header('user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'),
      header('x-li-lang: en_US'),
      header('x-li-page-instance: urn:li:page:d_flagship3_search_srp_jobs;PSIhsQSvS/Omd2tT0L8VFw=='),
      header('x-li-pem-metadata: Voyager - Careers - Jobs Search=jobs-search-results,Voyager - Careers - Critical - careers-api=jobs-search-results'),
      header('x-li-prefetch: 1'),
      header('x-li-track: {"clientVersion":"1.13.41695","mpVersion":"1.13.41695","osName":"web","timezoneOffset":1,"timezone":"Africa/Tunis","deviceFormFactor":"DESKTOP","mpName":"voyager-web","displayDensity":2,"displayWidth":2940,"displayHeight":1912}'),
      header('x-restli-protocol-version: 2.0.0'),
    ].join(' ');

    try {
      const { stdout: rawResponse } = await execAsync(curlCommand, {
        encoding: 'utf8',
        maxBuffer: 10 * 1024 * 1024
      });
      return JSON.parse(cleanJSON(rawResponse));
    } catch (error) {
      // Covers curl failures as well as non-JSON bodies (e.g. an HTML error page).
      console.error(` ✗ Error fetching jobs at offset ${start}:`, error.message);
      return null;
    }
  }
  340.  
  /**
   * Store a location row, leaving an already-present row untouched
   * (INSERT OR IGNORE).
   *
   * @param {{geoId: *, geoUrn: *, name: *, countryCode: *, searchKeyword: *}} location
   * @returns {Promise<void>} Resolves after the insert ran; rejects with the
   *   statement error.
   */
  function insertLocation(location) {
    return new Promise((resolve, reject) => {
      const statement = db.prepare(`
        INSERT OR IGNORE INTO locations (geo_id, geo_urn, name, country_code, search_keyword)
        VALUES (?, ?, ?, ?, ?)
      `);
      const { geoId, geoUrn, name, countryCode, searchKeyword } = location;

      const settle = (err) => {
        if (err) {
          reject(err);
          return;
        }
        resolve();
      };

      statement.run(geoId, geoUrn, name, countryCode, searchKeyword, settle);
      statement.finalize();
    });
  }
  355.  
  /**
   * Insert the job's company, or refresh it when it is already stored.
   *
   * An existing row gets its name, logo alt text and updated_at refreshed; a
   * missing row is inserted. Jobs without a companyId are skipped.
   *
   * Many result pages are processed concurrently, so two callers can both see
   * "no row" and both try to insert. The loser's UNIQUE violation is treated
   * as "row exists now" and falls back to the update instead of rejecting.
   * NOTE(review): this relies on companies.company_id having a UNIQUE/PRIMARY
   * KEY constraint; the schema is not visible here — confirm.
   *
   * @param {{companyId: ?string, companyUrn: ?string, company: string,
   *          companyLogoAlt: ?string}} job
   * @returns {Promise<void>} Rejects with any other database error.
   */
  async function insertCompany(job) {
    if (!job.companyId) {
      return;
    }

    const get = (sql, params) => new Promise((resolve, reject) => {
      db.get(sql, params, (err, row) => (err ? reject(err) : resolve(row)));
    });
    const run = (sql, params) => new Promise((resolve, reject) => {
      db.run(sql, params, (err) => (err ? reject(err) : resolve()));
    });
    const isUniqueViolation = (err) =>
      err?.code === 'SQLITE_CONSTRAINT' && /UNIQUE constraint failed/i.test(err.message || '');

    const existing = await get('SELECT company_id FROM companies WHERE company_id = ?', [job.companyId]);

    if (!existing) {
      try {
        await run(`
          INSERT INTO companies (company_id, company_urn, company_name, company_logo_alt)
          VALUES (?, ?, ?, ?)
        `, [job.companyId, job.companyUrn, job.company, job.companyLogoAlt]);
        return;
      } catch (err) {
        if (!isUniqueViolation(err)) {
          throw err;
        }
        // Lost the race to a concurrent insert: fall through and update.
      }
    }

    await run(`
      UPDATE companies
      SET company_name = ?, company_logo_alt = ?, updated_at = CURRENT_TIMESTAMP
      WHERE company_id = ?
    `, [job.company, job.companyLogoAlt, job.companyId]);
  }
  401.  
  /**
   * Insert or update a job, then make sure its job→location and job→filter
   * mapping rows exist.
   *
   * Steps, in order:
   *   1. jobs: UPDATE when a row with job.id exists, otherwise INSERT.
   *   2. job_locations: insert (job_id, geo_id) unless already present.
   *   3. job_filters: insert one row per (job_id, scraping_run_id) unless
   *      already present.
   *
   * Result pages are processed concurrently, so another caller may insert the
   * same row between our SELECT and INSERT. A UNIQUE violation on insert is
   * therefore treated as "the row exists now" rather than as a failure.
   * NOTE(review): that only helps where the tables declare UNIQUE/PRIMARY KEY
   * constraints; the schema is not visible here — confirm.
   *
   * @param {Object} job - Job as produced by parseJobListing().
   * @param {*} geoId - Location the job was found under.
   * @param {number} runId - scraping_runs id of the run that found the job.
   * @param {{workplace: string, experience: string, jobType: string,
   *          verified: string, applyWithLinkedin: string, description: string}} filter
   * @returns {Promise<void>} Rejects with any other database error.
   */
  async function insertJob(job, geoId, runId, filter) {
    const get = (sql, params) => new Promise((resolve, reject) => {
      db.get(sql, params, (err, row) => (err ? reject(err) : resolve(row)));
    });
    const run = (sql, params) => new Promise((resolve, reject) => {
      db.run(sql, params, (err) => (err ? reject(err) : resolve()));
    });
    const isUniqueViolation = (err) =>
      err?.code === 'SQLITE_CONSTRAINT' && /UNIQUE constraint failed/i.test(err.message || '');

    // Insert a mapping row unless the SELECT finds one; tolerate losing a race.
    const insertIfMissing = async (selectSql, selectParams, insertSql, insertParams) => {
      if (await get(selectSql, selectParams)) {
        return;
      }
      try {
        await run(insertSql, insertParams);
      } catch (err) {
        if (!isUniqueViolation(err)) {
          throw err;
        }
      }
    };

    // Column values shared by INSERT and UPDATE, in column order (without id).
    const fields = [
      job.entityUrn,
      job.cardEntityUrn,
      job.title,
      job.company,
      job.companyId || null,
      job.location,
      job.trackingId,
      job.referenceId,
      job.verified ? 1 : 0,
      job.footer?.easyApply ? 1 : 0,
      job.footer?.listedAt || null,
      job.footer?.listedAtTimestamp || null,
      job.details?.contentSource || null,
      job.details?.posterId || null,
      job.details?.repostedJob ? 1 : 0,
      job.companyLogoAlt || null,
      job.tertiaryInfo || null
    ];

    const updateJob = () => run(`
      UPDATE jobs SET
        entity_urn = ?, card_entity_urn = ?, title = ?, company = ?, company_id = ?,
        location = ?, tracking_id = ?, reference_id = ?, verified = ?, easy_apply = ?,
        listed_at = ?, listed_at_timestamp = ?, content_source = ?, poster_id = ?,
        reposted_job = ?, company_logo_alt = ?, tertiary_info = ?, updated_at = CURRENT_TIMESTAMP
      WHERE id = ?
    `, [...fields, job.id]);

    const insertNewJob = () => run(`
      INSERT INTO jobs (
        id, entity_urn, card_entity_urn, title, company, company_id,
        location, tracking_id, reference_id, verified, easy_apply,
        listed_at, listed_at_timestamp, content_source, poster_id,
        reposted_job, company_logo_alt, tertiary_info
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `, [job.id, ...fields]);

    // 1. Job row
    const existingJob = await get('SELECT id FROM jobs WHERE id = ?', [job.id]);
    if (existingJob) {
      await updateJob();
    } else {
      try {
        await insertNewJob();
      } catch (err) {
        if (!isUniqueViolation(err)) {
          throw err;
        }
        await updateJob(); // a concurrent caller inserted it first
      }
    }

    // 2. Job-location mapping
    await insertIfMissing(
      'SELECT id FROM job_locations WHERE job_id = ? AND geo_id = ?',
      [job.id, geoId],
      `
        INSERT INTO job_locations (job_id, geo_id)
        VALUES (?, ?)
      `,
      [job.id, geoId]
    );

    // 3. Job-filter mapping (one per scraping run)
    await insertIfMissing(
      'SELECT id FROM job_filters WHERE job_id = ? AND scraping_run_id = ?',
      [job.id, runId],
      `
        INSERT INTO job_filters (
          job_id, scraping_run_id, geo_id,
          workplace_type, experience_level, job_type,
          verified_only, apply_with_linkedin, filter_description
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
      `,
      [
        job.id,
        runId,
        geoId,
        filter.workplace || null,
        filter.experience || null,
        filter.jobType || null,
        filter.verified ? 1 : 0,
        filter.applyWithLinkedin ? 1 : 0,
        filter.description
      ]
    );
  }
  559.  
  /**
   * Record the start of a scraping run for one location/filter pair. The row
   * is created with status 'running' and started_at set to the current time.
   *
   * @param {*} geoId - Location being scraped.
   * @param {{workplace: string, experience: string, jobType: string,
   *          verified: string, applyWithLinkedin: string}} filter
   * @returns {Promise<number>} The id (lastID) of the new scraping_runs row.
   */
  function createScrapingRun(geoId, filter) {
    return new Promise((resolve, reject) => {
      const statement = db.prepare(`
        INSERT INTO scraping_runs (
          geo_id, workplace_type, experience_level, job_type,
          verified_only, apply_with_linkedin, started_at, status
        ) VALUES (?, ?, ?, ?, ?, ?, datetime('now'), 'running')
      `);

      const values = [
        geoId,
        filter.workplace || null,
        filter.experience || null,
        filter.jobType || null,
        filter.verified ? 1 : 0,
        filter.applyWithLinkedin ? 1 : 0,
      ];

      // Must be a regular function: lastID is read from `this`, which the
      // caller of this callback supplies.
      function onInserted(err) {
        if (err) {
          reject(err);
          return;
        }
        resolve(this.lastID);
      }

      statement.run(...values, onInserted);
      statement.finalize();
    });
  }
  585.  
  /**
   * Write the final counters and status of a scraping run and stamp its
   * completed_at with the current time.
   *
   * @param {number} runId - scraping_runs id to update.
   * @param {{totalJobs: number, successfulRequests: number,
   *          failedRequests: number, status: string}} stats
   * @returns {Promise<void>} Rejects with the statement error.
   */
  function updateScrapingRun(runId, stats) {
    return new Promise((resolve, reject) => {
      const statement = db.prepare(`
        UPDATE scraping_runs SET
          total_jobs_fetched = ?,
          successful_requests = ?,
          failed_requests = ?,
          completed_at = datetime('now'),
          status = ?
        WHERE id = ?
      `);

      const values = [
        stats.totalJobs,
        stats.successfulRequests,
        stats.failedRequests,
        stats.status,
        runId,
      ];

      const settle = (err) => {
        if (err) {
          reject(err);
          return;
        }
        resolve();
      };

      statement.run(...values, settle);
      statement.finalize();
    });
  }
  613.  
  /**
   * Build (without running) the page requests for one filter combination.
   *
   * One entry is produced per page of CONFIG.count results, up to
   * CONFIG.maxResultsPerFilter. Each entry's `fn` performs the fetch when
   * invoked and resolves to the page data together with its metadata.
   *
   * @param {*} geoId - Location to search.
   * @param {Object} filter - Filter combination (see FILTER_COMBINATIONS).
   * @param {number} runId - scraping_runs id the pages belong to.
   * @returns {Promise<Array<{fn: function(): Promise<Object>, filterDescription: string}>>}
   */
  async function fetchJobsForFilter(geoId, filter, runId) {
    const { maxResultsPerFilter, count: pageSize } = CONFIG;
    const pageCount = Math.ceil(maxResultsPerFilter / pageSize);

    // Describes the page at `index`; the request itself is deferred to fn().
    const describePage = (index) => {
      const offset = index * pageSize;
      const fn = async () => {
        const data = await fetchJobsAtOffset(offset, geoId, filter);
        return { offset, index, filter, runId, geoId, data };
      };
      return { fn, filterDescription: filter.description };
    };

    const pages = [];
    let index = 0;
    while (index < pageCount) {
      pages.push(describePage(index));
      index += 1;
    }
    return pages;
  }
  637.  
  638.  
  /**
   * Main function: scrape every location in CONFIG.geoIdsFile with every
   * filter combination and store the results in the database.
   *
   * For each location it creates one scraping run per filter, queues every
   * page request of every filter, executes them through the rate-limited
   * pool, saves jobs as each page arrives, then updates the scraping runs and
   * prints a per-filter summary.
   *
   * An optional first command-line argument is the index of the location to
   * resume from; anything that is not a positive integer means "start at 0".
   *
   * @returns {Promise<void>} Rejects on setup errors (database, geo file);
   *   individual request and insert failures are counted and reported instead.
   */
  async function fetchAllJobs() {
    console.log('='.repeat(100));
    console.log('LinkedIn Jobs Scraper - Database Version');
    console.log('='.repeat(100));
    console.log();

    // Initialize database
    await initDatabase();

    // Load geo IDs
    const geoData = JSON.parse(fs.readFileSync(CONFIG.geoIdsFile, 'utf8'));
    const locations = geoData.locations;
    if (!Array.isArray(locations)) {
      throw new Error(`Expected a "locations" array in ${CONFIG.geoIdsFile}`);
    }

    console.log(`Loaded ${locations.length} locations from ${CONFIG.geoIdsFile}`);
    console.log(`Will use ${FILTER_COMBINATIONS.length} filter combinations per location`);
    console.log();

    // Parse command-line argument for start index. A negative value would
    // index before the array and crash on the first location, so it is
    // treated like a missing argument.
    const requestedStart = Number.parseInt(process.argv[2], 10);
    const startIndex = Number.isInteger(requestedStart) && requestedStart > 0 ? requestedStart : 0;
    let totalJobsCollected = 0;

    if (startIndex > 0) {
      console.log(`⚠️ Resuming from index ${startIndex}`);
      console.log();
    }

    const totalLocations = locations.length - startIndex;
    console.log(`Processing ${totalLocations} locations (${startIndex} → ${locations.length - 1})`);
    console.log();

    let totalLocationsProcessed = 0;

    // Process each location starting from resume index
    for (let i = startIndex; i < locations.length; i++) {
      const location = locations[i];
      totalLocationsProcessed++;

      console.log('='.repeat(100));
      console.log(`[${i}/${locations.length - 1}] ${location.name} (${location.countryCode || 'N/A'})`);
      console.log(`Geo ID: ${location.geoId} | Progress: ${totalLocationsProcessed}/${totalLocations} locations`);
      console.log('='.repeat(100));
      console.log();

      // Insert location into database
      await insertLocation(location);

      // Create scraping runs for all filters
      console.log(`Creating scraping runs for ${FILTER_COMBINATIONS.length} filters...`);
      const filterRunIds = [];
      for (const filter of FILTER_COMBINATIONS) {
        const runId = await createScrapingRun(location.geoId, filter);
        filterRunIds.push({ filter, runId });
      }

      // Collect ALL API calls across ALL filters
      console.log(`Preparing API calls for all filters...`);
      const allFetchFunctions = [];
      for (const { filter, runId } of filterRunIds) {
        const filterFetchFunctions = await fetchJobsForFilter(location.geoId, filter, runId);
        allFetchFunctions.push(...filterFetchFunctions.map(f => ({ ...f, runId })));
      }

      const totalApiCalls = allFetchFunctions.length;
      console.log(`Executing ${totalApiCalls} API calls across ${FILTER_COMBINATIONS.length} filters with streaming processing (220 concurrent requests, max 350/min)...`);
      console.log();

      // Track results by filter as they come in (keyed by filter description)
      const resultsByFilter = {};
      const filterStats = {};

      // Initialize stats tracking
      for (const { filter } of filterRunIds) {
        resultsByFilter[filter.description] = [];
        filterStats[filter.description] = { completed: 0, total: 0, jobs: 0 };
      }

      // Count total requests per filter
      for (const f of allFetchFunctions) {
        filterStats[f.filterDescription].total++;
      }

      let totalJobsProcessed = 0;
      let failedInserts = 0;
      let lastInsertError = null;
      const startTime = Date.now();

      // Execute with streaming processing - results are processed as they arrive
      await executeWithStreamingRateLimit(
        allFetchFunctions.map(f => f.fn),
        async (result, completed, total) => {
          const filterDesc = result.filter.description;
          resultsByFilter[filterDesc].push(result);
          filterStats[filterDesc].completed++;

          // Process this individual result immediately
          if (result.data) {
            const jobs = parseJobListing(result.data);

            // Save to database immediately
            for (const job of jobs) {
              try {
                await insertCompany(job);
                await insertJob(job, result.geoId, result.runId, result.filter);
                totalJobsProcessed++;
              } catch (err) {
                // Keep going, but remember the failure so it is reported
                // after the run instead of disappearing silently.
                failedInserts++;
                lastInsertError = err;
              }
            }
            filterStats[filterDesc].jobs += jobs.length;
          }

          // Show progress every 10 completed requests
          if (completed % 10 === 0 || completed === total) {
            const elapsedMinutes = (Date.now() - startTime) / 60000;
            const rate = elapsedMinutes > 0 ? (completed / elapsedMinutes).toFixed(1) : '0.0';
            process.stdout.write(`\r Progress: ${completed}/${total} requests (${rate}/min) | ${totalJobsProcessed} jobs saved`);
          }
        }
      );

      const duration = ((Date.now() - startTime) / 1000).toFixed(1);
      console.log();
      console.log();
      console.log(`✓ All API calls completed in ${duration}s`);
      if (failedInserts > 0) {
        console.warn(`⚠️ ${failedInserts} jobs could not be saved (last error: ${lastInsertError?.message ?? lastInsertError})`);
      }
      console.log();

      // Update scraping runs and show summary
      console.log('Filter Summary:');
      let locationJobsCount = 0;
      for (const { filter, runId } of filterRunIds) {
        const filterResults = resultsByFilter[filter.description] || [];
        const stats = filterStats[filter.description];

        // Count successful/failed requests
        let successful = 0, failed = 0;
        for (const r of filterResults) {
          if (r.data) successful++;
          else failed++;
        }

        // Update scraping run
        await updateScrapingRun(runId, {
          totalJobs: stats.jobs,
          successfulRequests: successful,
          failedRequests: failed,
          status: failed === filterResults.length ? 'failed' : 'completed'
        });

        locationJobsCount += stats.jobs;
        console.log(` ${filter.description.padEnd(40)} → ${stats.jobs} jobs (${successful}/${stats.total} successful)`);
      }

      totalJobsCollected += locationJobsCount;
      console.log();
      console.log(` ✓ Location complete: ${locationJobsCount} jobs collected`);
      console.log(` Running total: ${totalJobsCollected} jobs across ${totalLocationsProcessed} locations`);

      // Delay between locations
      if (i < locations.length - 1) {
        console.log(`\n Waiting ${CONFIG.delayBetweenGeoIds}ms before next location...`);
        await sleep(CONFIG.delayBetweenGeoIds);
      }
    }

    console.log();
    console.log('='.repeat(100));
    console.log('SCRAPING COMPLETE');
    console.log('='.repeat(100));
    // This is the sum of per-filter result counts; the same job can match
    // several filters, so it is not a count of unique jobs.
    console.log(`Total jobs fetched (summed over filters, may include duplicates): ${totalJobsCollected}`);
    console.log(`Total locations processed: ${totalLocationsProcessed}`);
    console.log(`Database: ${CONFIG.dbPath}`);
    console.log();
    console.log('All done! 🎉');

    // Close database
    db.close();
  }
  817.  
  818. // Graceful shutdown handler
  819. // process.on('SIGINT', () => {
  820. // console.log('\n\n⚠️ Interrupt received!');
  821. // console.log(' To resume, run: node fetch-all-jobs-db.js <last_completed_index + 1>');
  822. // if (db) db.close();
  823. // process.exit(0);
  824. // });
  825.  
  // Entry point: start the scraper. If it fails, print the error, close the
  // database when one was opened, and exit with status 1.
  const handleFatalError = (error) => {
    console.error('Fatal error:', error);
    if (db) {
      db.close();
    }
    process.exit(1);
  };

  fetchAllJobs().catch(handleFatalError);
  832.  
Advertisement
Add Comment
Please, Sign In to add comment