nicuf

Powershell Parsing+Batch-Processor

Jun 16th, 2021 (edited)
571
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. English: https://neculaifantanaru.com/en/creating-a-batch-processing-powershell-with-regex-and-html-tags-parsing.html
  2. Romanian: https://neculaifantanaru.com/creating-a-batch-processing-powershell-with-regex-and-html-tags-parsing.html
  3.  
  4.  
  # Batch processor: for every *.html file in $sourcedir, copy the value of one
  # "source" tag into a set of "target" tags, then write the result to a file of
  # the same name in $resultsdir. All matching is regex-based and, like the
  # -match / -replace operators it relies on, case-insensitive.
  $sourcedir = "C:\Folder1\"
  $resultsdir = "C:\Folder1\"

  # Returns $Text with every span matched by each pattern in $TargetPatterns
  # replaced by the first span matched by $SourcePattern.
  # If $SourcePattern does not match, $Text is returned unchanged so that a
  # missing source tag never blanks out the target tags.
  function Copy-MatchedValue {
      param(
          [string]   $Text,
          [string]   $SourcePattern,
          [string[]] $TargetPatterns
      )

      if ($Text -match $SourcePattern) {
          # In a -replace replacement string "$" introduces a substitution
          # ($0, $1, $& ...). Doubling it makes the copied value literal.
          $literalValue = $Matches[0].Replace('$', '$$')

          foreach ($targetPattern in $TargetPatterns) {
              $Text = $Text -replace $targetPattern, $literalValue
          }
      }

      return $Text
  }

  # Value matchers. They stop at the closing quote instead of running greedily
  # to the last quote on the line, so extra attributes after the value (or
  # minified one-line HTML) are left alone.
  $htmlAttrValue = '[^"]*(?=")'                # HTML attribute value (may be empty)
  $jsonStrValue  = '(?:[^"\\]|\\.)*(?=")'      # JSON string value, tolerates \" escapes

  # Each rule: where the value comes from, and which tags receive it.
  # Rules are applied in order.
  $rules = @(
      # <link rel="canonical"> -> og:url and JSON "@id"
      @{
          Source  = '(?<=<link rel="canonical" href=")[^"]+(?=")'
          Targets = @(
              ('(?<=<meta property="og:url" content=")' + $htmlAttrValue),
              ('(?<="@id": ")' + $jsonStrValue)
          )
      },
      # <title> -> og:title, abstract, keywords, Subject, JSON "headline" and "keywords"
      @{
          Source  = '(?<=<title>).+?(?=</title>)'
          Targets = @(
              ('(?<=<meta property="og:title" content=")' + $htmlAttrValue),
              ('(?<=<meta name="abstract" content=")' + $htmlAttrValue),
              ('(?<=<meta name="keywords" content=")' + $htmlAttrValue),
              ('(?<=<meta name="Subject" content=")' + $htmlAttrValue),
              ('(?<="headline": ")' + $jsonStrValue),
              ('(?<="keywords": ")' + $jsonStrValue)
          )
      },
      # <meta name="description"> -> og:description and JSON "description"
      @{
          Source  = '(?<=<meta name="description" content=")[^"]+(?=")'
          Targets = @(
              ('(?<=<meta property="og:description" content=")' + $htmlAttrValue),
              ('(?<="description": ")' + $jsonStrValue)
          )
      }
  )

  Get-ChildItem -Path $sourcedir -Filter *.html | ForEach-Object {
      $content = Get-Content -Path $_.FullName -Raw

      foreach ($rule in $rules) {
          $content = Copy-MatchedValue -Text $content -SourcePattern $rule.Source -TargetPatterns $rule.Targets
      }

      # Join-Path avoids a doubled separator when $resultsdir already ends in "\".
      # -NoNewline: the -Raw text already carries the file's own line endings, so
      # letting Set-Content append one would grow the file by a newline per run.
      # NOTE(review): encoding is left at the cmdlet defaults, as in the original;
      # confirm that suits the HTML files being processed.
      Set-Content -Path (Join-Path $resultsdir $_.Name) -Value $content -NoNewline
  }
RAW Paste Data