Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ' ===================================================================
- ' VBA Subroutine to Dynamically Extract Meta Tag Data from URLs
- ' - Reads required fields from a "Config" worksheet.
- ' - Correctly handles and joins repeating fields.
- ' - Formats the final output as a named Excel Table.
- ' - FINAL VERSION 3.0
- ' ===================================================================
- ' excel set up instructions.
- ' ===================================================================
- ' right click html page and view source
- ' copy the meta tags and paste into excel
- ' text to columns with " delimiter - that gives you all the fields
- ' put a header at the top of column A "whatever"
- ' pivot table on that col, copy paste the pivot table as values - that gives you the unique names as citation_author repeat
- '
- ' Create Worksheets
- ' - URLs
- ' - Journal Data
- ' - Config
- '
- ' Paste URLs and Config values in Col A starting from Row 2
- ' ===================================================================
- Sub ExtractJournalMetaData_WithTableFormatting()
- ' --- Variable Declarations ---
- Dim http As Object
- Dim urlSheet As Worksheet, dataSheet As Worksheet, configSheet As Worksheet
- Dim urlCell As Range, urlRange As Range, dataRange As Range
- Dim newTable As ListObject
- Dim fieldsToExtract As Variant
- Dim lastRow As Long, lastConfigRow As Long, currentRow As Long, i As Long, j As Long
- ' --- Setup: Define Worksheets ---
- On Error Resume Next
- Set urlSheet = ThisWorkbook.Sheets("URLs")
- Set dataSheet = ThisWorkbook.Sheets("Journal Data")
- Set configSheet = ThisWorkbook.Sheets("Config")
- If urlSheet Is Nothing Or configSheet Is Nothing Then
- MsgBox "Error: Please ensure you have both a 'URLs' sheet and a 'Config' sheet.", vbCritical
- Exit Sub
- End If
- If dataSheet Is Nothing Then
- Set dataSheet = ThisWorkbook.Sheets.Add(After:=urlSheet)
- dataSheet.Name = "Journal Data"
- End If
- On Error GoTo 0
- ' --- Read Configuration ---
- lastConfigRow = configSheet.Cells(configSheet.Rows.Count, "A").End(xlUp).Row
- If lastConfigRow < 2 Then
- MsgBox "The 'Config' sheet is empty. Please list fields to extract in column A, starting at A2.", vbExclamation
- Exit Sub
- End If
- fieldsToExtract = configSheet.Range("A2:A" & lastConfigRow).Value
- ' --- Setup: Prepare Data Sheet ---
- dataSheet.Cells.ClearContents
- dataSheet.Range("A1").Value = "URL"
- For i = 1 To UBound(fieldsToExtract, 1)
- dataSheet.Cells(1, i + 1).Value = fieldsToExtract(i, 1)
- Next i
- dataSheet.Rows(1).Font.Bold = True
- ' --- Input Validation ---
- lastRow = urlSheet.Cells(urlSheet.Rows.Count, "A").End(xlUp).Row
- If lastRow < 2 Then
- MsgBox "No URLs found in the 'URLs' sheet. Please paste them starting from cell A2.", vbInformation
- Exit Sub
- End If
- Set urlRange = urlSheet.Range("A2:A" & lastRow)
- ' --- Main Processing Loop ---
- Set http = CreateObject("MSXML2.XMLHTTP.6.0")
- currentRow = 2
- For Each urlCell In urlRange.Cells
- url = Trim(urlCell.Value)
- Application.StatusBar = "Processing " & (currentRow - 1) & " of " & urlRange.Count & ": " & url
- dataSheet.Cells(currentRow, 1).Value = url
- If Left(url, 4) = "http" Then
- On Error Resume Next
- http.Open "GET", url, False
- http.Send
- On Error GoTo 0
- If http.Status = 200 Then
- htmlContent = http.responseText
- For i = 1 To UBound(fieldsToExtract, 1)
- extractedValue = GetAllMetaContent(htmlContent, fieldsToExtract(i, 1))
- dataSheet.Cells(currentRow, i + 1).Value = extractedValue
- Next i
- Else
- For i = 1 To UBound(fieldsToExtract, 1)
- dataSheet.Cells(currentRow, i + 1).Value = "Error: HTTP " & http.Status
- Next i
- End If
- Else
- For i = 1 To UBound(fieldsToExtract, 1)
- dataSheet.Cells(currentRow, i + 1).Value = "Invalid URL"
- Next i
- End If
- currentRow = currentRow + 1
- Next urlCell
- Application.StatusBar = False
- ' ==========================================================
- ' --- NEW: Final Formatting Section ---
- ' ==========================================================
- ' Define the full range of the data we just created
- Set dataRange = dataSheet.Range("A1").CurrentRegion
- ' Delete any existing table with the same name to prevent errors
- On Error Resume Next
- dataSheet.ListObjects("tblJournalExtract").Delete
- On Error GoTo 0
- ' 1. Convert the range into a named Excel Table
- Set newTable = dataSheet.ListObjects.Add(xlSrcRange, dataRange, , xlYes)
- newTable.Name = "tblJournalExtract"
- newTable.TableStyle = "TableStyleMedium2" ' Optional: Apply a default style
- ' 2. Set vertical alignment and text wrapping for the whole table
- With newTable.Range
- .VerticalAlignment = xlTop
- .WrapText = True
- End With
- ' 3. Loop through each column to set widths
- For j = 1 To newTable.ListColumns.Count
- ' Autofit the column first to get its ideal width
- newTable.ListColumns(j).Range.Columns.AutoFit
- ' If the autofitted width is more than 60, cap it at 60
- If newTable.ListColumns(j).Range.Columns.ColumnWidth > 60 Then
- newTable.ListColumns(j).Range.Columns.ColumnWidth = 60
- End If
- Next j
- ' ==========================================================
- ' --- End of New Formatting Section ---
- ' ==========================================================
- ' --- Final Message & Cleanup ---
- MsgBox "Extraction complete! " & urlRange.Count & " URLs processed and formatted into a table.", vbInformation
- Set http = Nothing
- Set urlSheet = Nothing
- Set dataSheet = Nothing
- Set configSheet = Nothing
- Set urlCell = Nothing
- Set urlRange = Nothing
- Set dataRange = Nothing
- Set newTable = Nothing
- End Sub
- ' ===================================================================
- ' Helper Function to Extract ALL Occurrences of a Meta Tag's Content
- ' ===================================================================
- Private Function GetAllMetaContent(ByVal textToSearch As String, ByVal metaName As String) As String
- Dim regex As Object, allMatches As Object, singleMatch As Object
- Dim results As String, pattern As String
- Set regex = CreateObject("VBScript.RegExp")
- pattern = "<meta\s+name=""" & metaName & """.*?content=""([^""]*)"""
- With regex
- .Global = True
- .IgnoreCase = True
- .pattern = pattern
- End With
- Set allMatches = regex.Execute(textToSearch)
- If allMatches.Count > 0 Then
- For Each singleMatch In allMatches
- results = results & singleMatch.SubMatches(0) & "; "
- Next singleMatch
- GetAllMetaContent = Left(results, Len(results) - 2)
- Else
- GetAllMetaContent = "Not Found"
- End If
- Set regex = Nothing
- Set allMatches = Nothing
- Set singleMatch = Nothing
- End Function
Advertisement
Add Comment
Please, Sign In to add comment