Advertisement
opexxx

Get All PDF links from HTML files.ps1

Apr 7th, 2014
160
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  <#
  .SYNOPSIS
  Reports every <a> link whose href contains "pdf" in the .htm/.aspx files under a folder.

  .DESCRIPTION
  Recursively scans $path for *.htm and *.aspx files, parses each one with
  HtmlAgilityPack and emits one object per matching anchor with these properties:
    PdfLink    - the href value of the anchor
    FileName   - the file's full path with the $path prefix swapped for $urlpath
                 and backslashes turned into forward slashes
    LineNumber - the Line value HtmlAgilityPack reports for the anchor node
  The objects are written to the pipeline sorted by PdfLink. Progress messages
  ("Checking ...", "Found ...") go to the host, not to the pipeline.

  .PARAMETER path
  Folder to scan. May be absolute or relative to the current location.

  .PARAMETER urlpath
  Text that replaces the $path prefix when building FileName.
  #>
  param ($path, $urlpath)

  # NOTE(review): the HtmlAgilityPack location is hard-coded; the script fails
  # to parse anything if the DLL is not at this path.
  add-type -Path c:\tools\html-agility-pack\HtmlAgilityPack.dll

  # Prefixes that may appear at the start of a file's FullName. The path exactly
  # as supplied is tried first; its resolved absolute form is added so that a
  # relative $path (e.g. ".\site") can still be mapped onto $urlpath.
  $roots = @($path)
  if ($path) {
      $resolved = @(Resolve-Path -Path $path -ErrorAction SilentlyContinue)
      if ($resolved.Count -eq 1) {
          $absoluteRoot = $resolved[0].ProviderPath
          # Keep a trailing separator the caller typed, so the join with
          # $urlpath behaves the same as it does for an absolute $path.
          if ("$path" -match '[\\/]$' -and $absoluteRoot -notmatch '[\\/]$') {
              $absoluteRoot += [System.IO.Path]::DirectorySeparatorChar
          }
          $roots += $absoluteRoot
      }
  }

  $files = Get-ChildItem -Include *.htm,*.aspx -Path $path -Recurse
  $doc = New-Object HtmlAgilityPack.HtmlDocument
  $result = $files | ForEach-Object {
      Write-Host "Checking $_"
      $fullName = $_.FullName

      # Swap the local root for the URL root. The prefix comparison ignores
      # case, so a $path typed with different letter case still maps.
      $name = $null
      foreach ($root in $roots) {
          if ($root -and $fullName.StartsWith([string]$root, [System.StringComparison]::OrdinalIgnoreCase)) {
              $name = "$urlpath" + $fullName.Substring(([string]$root).Length)
              break
          }
      }
      if ($null -eq $name) {
          # No known prefix matched: fall back to the plain substring replace.
          $name = $fullName.Replace($path, $urlpath)
      }
      $name = $name.Replace("\", "/")

      # The same document object is reused; Load replaces its content per file.
      $doc.Load($fullName)
      $linknodes = $doc.DocumentNode.SelectNodes("//a")
      if ($linknodes) {
          foreach ($node in $linknodes) {
              # Anchors without an href yield "" and therefore never match.
              $pdflink = $node.GetAttributeValue("href", "")
              if ($pdflink.ToLower().Contains("pdf"))
              {
                  Write-Host "Found" $pdflink
                  New-Object PsObject -Property @{PdfLink = $pdflink; FileName = $name; LineNumber = $node.Line;}
              }
          }
      }
  }
  $result | Sort-Object PdfLink
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement