Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Collects links to works from a range of Archive of Our Own tag listing pages
# and writes them, as absolute URLs with duplicates removed, to $OutputLinksFile.

# File the collected links are appended to (and finally de-duplicated in).
$OutputLinksFile = "C:\temp\OutputLinks.txt"
$InputLinks = @()
# Listing URL without the page number; the number is appended below.
$BasePage = "https://archiveofourown.org/tags/Buffy%20the%20Vampire%20Slayer/works?page="
[int]$FirstPageNumber = 1
[int]$LastPageNumber = 2
$CurrentPageNumber = $FirstPageNumber

# Make a list of all the pages we want to input, counting from FirstPageNumber to LastPageNumber
while ($CurrentPageNumber -le $LastPageNumber) {
    $InputLinks += "$BasePage$CurrentPageNumber"
    $CurrentPageNumber++
}

# If you want to manually input a list of pages instead, first define $InputLinksFile
# (it is not set anywhere in this script; Get-Content reads it as one entry per line),
# then remove # in front of the next line:
#$InputLinks = Get-Content -Path $InputLinksFile

ForEach ($InputLink in $InputLinks) {
    # Fetch the entire page. Get links in page with ().Links. Page is compressed with gzip, so we'll have to account for that
    $InputPageLinks = (Invoke-WebRequest -Uri $InputLink -Headers @{"Accept-Encoding"="gzip"}).Links

    # Filter the link list to only contain links with the sequence "/works/" in it.
    $FilteredOutputLinks = $InputPageLinks | Where-Object {$_.href -like "*/works/*"}

    # The provided links are relative and not absolute, so we need to add the domain name to the output
    foreach ($OutputLink in $FilteredOutputLinks) {
        $Href = $OutputLink.href

        # Keep only the first two path segments (e.g. "works/<id>"), dropping any
        # trailing path, query string or fragment. The separators are passed as an
        # explicit char[] so that every PowerShell edition splits on each character;
        # a plain string argument is treated as one literal separator on .NET Core.
        $WorkPath = $Href.Split([char[]]"/,?#")[1,2] -join "/"

        $FinalLink = "https://archiveofourown.org/$WorkPath"

        # -notmatch is case-insensitive, so this also drops ".../works/search" links.
        If ($FinalLink -notmatch "Search"){
            Out-File -Append -FilePath $OutputLinksFile -InputObject $FinalLink
        }
    }

    # Reset so a failed request on the next page cannot reuse this page's links.
    Clear-Variable InputPageLinks
}

# Remove duplicates
# Get-Unique only collapses adjacent duplicates, and the collected list is not
# sorted, so Select-Object -Unique is used instead; it removes every repeat while
# keeping the first-seen order. The file is read fully into memory before being
# overwritten, so no separate delete step is needed.
$Content = Get-Content $OutputLinksFile
$Content | Select-Object -Unique | Out-File -FilePath $OutputLinksFile
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement