Advertisement
ekostadinov

Bulgarian Jobs Web Portal

Aug 18th, 2014
619
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 2.90 KB | None | 0 0
  function CrawlPages ($pagesObj, $jobsURL, $file)
  {
      <#
      .SYNOPSIS
      Downloads every result page in $pagesObj and appends the job postings
      linked from each page to $file.

      .PARAMETER pagesObj
      One or more objects with an href property holding a page link relative
      to $jobsURL. $null or an empty collection means nothing is crawled.

      .PARAMETER jobsURL
      Base URL that each relative href is appended to.

      .PARAMETER file
      Path of the file that receives the innerHTML of every element whose
      class matches "jobDataView".
      #>

      # @($null) would yield one $null element, so bail out explicitly.
      if ($null -eq $pagesObj) { return }

      foreach ($page in @($pagesObj))
      {
          $pageUrl = $jobsURL + [string]$page.href
          try
          {
              $pageResponse = Invoke-WebRequest $pageUrl -ErrorAction Stop
          }
          catch
          {
              # Without this guard a failed request would leave the previous
              # page's response in the variable and its jobs would be
              # crawled a second time.
              Write-Warning ("Skipping page {0}: {1}" -f $pageUrl, $_.Exception.Message)
              continue
          }

          # @() keeps the result an array even when zero or one link matches.
          $jobLinks = @($pageResponse.Links |
              Where-Object { $_.Class -match 'MainLinkBold' } |
              Select-Object -Property href)

          # NOTE(review): iteration starts at index 1, so the first
          # "MainLinkBold" link is never visited. Presumably that link is not
          # a job posting - confirm against the site markup.
          for ($i = 1; $i -lt $jobLinks.Count; $i++)
          {
              $jobUrl = $jobsURL + [string]$jobLinks[$i].href
              try
              {
                  $jobResponse = Invoke-WebRequest $jobUrl -ErrorAction Stop
              }
              catch
              {
                  Write-Warning ("Skipping job {0}: {1}" -f $jobUrl, $_.Exception.Message)
                  continue
              }
              #Start-Sleep -Seconds 1

              $jobHtml = $jobResponse.AllElements |
                  Where-Object { $_.Class -match 'jobDataView' } |
                  Select-Object -ExpandProperty innerHTML

              # Nothing to append when the page has no "jobDataView" element.
              if ($null -ne $jobHtml)
              {
                  Add-Content -Path $file -Value $jobHtml
              }
          }
      }
  }
  function CrawlJobLinks ($pageLink, $currKeyword, $jobsURL, $file)
  {
      <#
      .SYNOPSIS
      Runs one keyword search, appends the job postings found on the first
      result page to $file, then hands any further result pages to CrawlPages.

      .PARAMETER pageLink
      Search URL ending in "keyword=", to which the keyword is appended.

      .PARAMETER currKeyword
      Search keyword. It is URL-escaped here unless it already contains
      percent-escapes.

      .PARAMETER jobsURL
      Base URL that each relative job/page href is appended to.

      .PARAMETER file
      Path of the file that receives the innerHTML of every element whose
      class matches "jobDataView".
      #>

      # Characters such as '#', '+' and '&' change the meaning of the query
      # string when sent raw (e.g. "C#" would be cut at the fragment marker).
      # Keywords that already carry percent-escapes are passed through
      # untouched so they are not encoded twice.
      $keywordText = [string]$currKeyword
      if ([uri]::UnescapeDataString($keywordText) -eq $keywordText)
      {
          $keywordText = [uri]::EscapeDataString($keywordText)
      }
      $searchUrl = $pageLink + $keywordText

      try
      {
          $searchResponse = Invoke-WebRequest $searchUrl -ErrorAction Stop
      }
      catch
      {
          Write-Warning ("Search failed for {0}: {1}" -f $searchUrl, $_.Exception.Message)
          return
      }
      #Start-Sleep -Seconds 1

      # @() keeps both results arrays even when zero or one link matches.
      $pages = @($searchResponse.Links |
          Where-Object { $_.Class -match 'pathlink' } |
          Select-Object -Property href)
      $jobLinks = @($searchResponse.Links |
          Where-Object { $_.Class -match 'MainLinkBold' } |
          Select-Object -Property href)

      # Check that we have at least one page with jobs.
      if ($jobLinks.Count -eq 0) { return }

      # NOTE(review): iteration starts at index 1, so the first
      # "MainLinkBold" link is never visited. Presumably that link is not a
      # job posting - confirm against the site markup.
      for ($i = 1; $i -lt $jobLinks.Count; $i++)
      {
          $jobUrl = $jobsURL + [string]$jobLinks[$i].href
          try
          {
              $jobResponse = Invoke-WebRequest $jobUrl -ErrorAction Stop
          }
          catch
          {
              Write-Warning ("Skipping job {0}: {1}" -f $jobUrl, $_.Exception.Message)
              continue
          }
          #Start-Sleep -Seconds 1

          $jobHtml = $jobResponse.AllElements |
              Where-Object { $_.Class -match 'jobDataView' } |
              Select-Object -ExpandProperty innerHTML

          # Nothing to append when the page has no "jobDataView" element.
          if ($null -ne $jobHtml)
          {
              Add-Content -Path $file -Value $jobHtml
          }
      }

      # Check whether there is more than one page with jobs.
      if ($pages.Count -gt 0)
      {
          #can be done via extension methods or recursion...add PageObjectModel
          CrawlPages -pagesObj $pages -jobsURL $jobsURL -file $file
      }
  }
  # Entry point: for every keyword listed in techsData.txt, run a jobs.bg
  # search, collect the job postings into a dated HTML report, then open the
  # report in Chrome.
  $date = Get-Date -Format 'dd_MM_yyyy'
  $file = 'C:\Users\evgeni.kostadinov\Desktop\webCrawlers\1-JobsTech\all\requs' + $date + '.html'

  # @() keeps $db an array even when the file holds a single line; without
  # it a one-line file comes back as a bare string and indexing it walks the
  # keyword character by character.
  $db = @(Get-Content 'C:\Users\evgeni.kostadinov\Desktop\webCrawlers\techsData.txt')
  $jobsURL = 'http://www.jobs.bg/'

  # The search URL is the same for every keyword, so build it once.
  $url = "http://www.jobs.bg/front_job_search.php?first=1&str_regions=&str_locations=&tab=jobs&old_country=&country=-1&region=0&l_category%5B%5D=0&keyword="

  # Create/truncate the report with its heading. The redirect consumes the
  # string, so there is nothing to capture in a variable.
  "Jobs requirements <br/>" > $file

  foreach ($searchKeyword in $db)
  {
      # A blank line would run a search with no keyword at all.
      if ([string]::IsNullOrWhiteSpace($searchKeyword)) { continue }

      # Encode the keyword so characters like '<' or '&' cannot break the
      # report markup.
      $h1 = "<h1 style=""text-align:center;background-color:red;color:white;""> " +
          [System.Net.WebUtility]::HtmlEncode($searchKeyword) + "</h1>"
      Add-Content $file $h1

      CrawlJobLinks -pageLink $url -currKeyword $searchKeyword -jobsURL $jobsURL -file $file
  }

  # Quote the path so it reaches Chrome as one argument.
  Start-Process chrome -ArgumentList ('"{0}"' -f $file)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement