Advertisement
Guest User

Untitled

a guest
Mar 13th, 2018
135
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  # Collects links to works from a range of Archive of Our Own tag listing pages
  # and writes them, de-duplicated, as absolute URLs to $OutputLinksFile.

  # File the resulting links are written to (appended to if it already exists).
  $OutputLinksFile = "C:\temp\OutputLinks.txt"
  $InputLinks = @()

  # Listing URL without the page number; the page number is appended below.
  $BasePage = "https://archiveofourown.org/tags/Buffy%20the%20Vampire%20Slayer/works?page="
  [int]$FirstPageNumber = 1
  [int]$LastPageNumber = 2

  # Make a list of all the pages we want to fetch, counting from
  # FirstPageNumber to LastPageNumber (inclusive).
  for ($CurrentPageNumber = $FirstPageNumber; $CurrentPageNumber -le $LastPageNumber; $CurrentPageNumber++) {
      $InputLinks += "$BasePage$CurrentPageNumber"
  }

  # If you want to manually input a list of pages instead, remove # in front of the next line.
  # NOTE: $InputLinksFile is not defined anywhere in this script; set it to the path of a
  # text file with one page URL per line before enabling this.
  #$InputLinks = Get-Content -Path $InputLinksFile

  foreach ($InputLink in $InputLinks) {
      # Fetch the page and take its parsed link list. The request asks for a
      # gzip-compressed response. -UseBasicParsing avoids the dependency on the
      # Internet Explorer engine in Windows PowerShell.
      try {
          $InputPageLinks = (Invoke-WebRequest -Uri $InputLink -UseBasicParsing -Headers @{"Accept-Encoding"="gzip"}).Links
      }
      catch {
          # Skip pages that cannot be fetched instead of re-processing the links
          # left over from the previous iteration.
          Write-Warning "Skipping ${InputLink}: $($_.Exception.Message)"
          continue
      }

      # Keep only links whose href contains the sequence "/works/".
      $FilteredOutputLinks = $InputPageLinks | Where-Object { $_.href -like "*/works/*" }

      # The provided links are relative and not absolute, so we need to add the
      # domain name to the output.
      foreach ($OutputLink in $FilteredOutputLinks) {
          # Split on '/', '?' and '#' and keep the first two path segments
          # (element 0 is the empty string before the leading '/'). The explicit
          # char[] makes this behave the same on Windows PowerShell and on
          # PowerShell 6+, where a plain string argument is treated as one
          # literal separator.
          $RelativePath = $OutputLink.href.Split([char[]]'/?#')[1,2] -join "/"
          $FinalLink = "https://archiveofourown.org/$RelativePath"

          # Drop the search link (-notmatch is case-insensitive).
          if ($FinalLink -notmatch "Search") {
              Out-File -Append -FilePath $OutputLinksFile -InputObject $FinalLink
          }
      }
  }

  # Remove duplicates. Select-Object -Unique drops repeated lines wherever they
  # occur and keeps the original order; Get-Unique would only collapse
  # duplicates that are directly adjacent. The file is read completely before
  # it is overwritten, and the step is skipped when no link was ever written.
  if (Test-Path -Path $OutputLinksFile) {
      $Content = Get-Content -Path $OutputLinksFile | Select-Object -Unique
      $Content | Out-File -FilePath $OutputLinksFile
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement