Guest User

VBA to extract website data

a guest
Jul 3rd, 2025
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
VBScript 7.21 KB | Source Code | 0 0
  1. ' ===================================================================
  2. '  VBA Subroutine to Dynamically Extract Meta Tag Data from URLs
  3. '  - Reads required fields from a "Config" worksheet.
  4. '  - Correctly handles and joins repeating fields.
  5. '  - Formats the final output as a named Excel Table.
  6. '  - FINAL VERSION 3.0
  7. ' ===================================================================
  8. ' excel set up instructions.
  9. ' ===================================================================
  10. ' right click html page and view source
  11. ' copy the meta tags and paste into excel
  12. ' text to columns with " delimiter - that gives you all the fields
  13. ' put a header at the top of column A "whatever"
  14. ' pivot table on that col, copy paste the pivot table as values - that gives you the unique names as citation_author repeat
  15. '
  16. ' Create Worksheets
  17. ' - URLs
  18. ' - Journal Data
  19. ' - Config
  20. '
  21. ' Paste URLs and Config values in Col A starting from Row 2
  22. ' ===================================================================
  23.  
  24. Sub ExtractJournalMetaData_WithTableFormatting()
  25.  
  26.     ' --- Variable Declarations ---
  27.    Dim http As Object
  28.     Dim urlSheet As Worksheet, dataSheet As Worksheet, configSheet As Worksheet
  29.     Dim urlCell As Range, urlRange As Range, dataRange As Range
  30.     Dim newTable As ListObject
  31.     Dim fieldsToExtract As Variant
  32.     Dim lastRow As Long, lastConfigRow As Long, currentRow As Long, i As Long, j As Long
  33.  
  34.     ' --- Setup: Define Worksheets ---
  35.    On Error Resume Next
  36.     Set urlSheet = ThisWorkbook.Sheets("URLs")
  37.     Set dataSheet = ThisWorkbook.Sheets("Journal Data")
  38.     Set configSheet = ThisWorkbook.Sheets("Config")
  39.     If urlSheet Is Nothing Or configSheet Is Nothing Then
  40.         MsgBox "Error: Please ensure you have both a 'URLs' sheet and a 'Config' sheet.", vbCritical
  41.         Exit Sub
  42.     End If
  43.     If dataSheet Is Nothing Then
  44.         Set dataSheet = ThisWorkbook.Sheets.Add(After:=urlSheet)
  45.         dataSheet.Name = "Journal Data"
  46.     End If
  47.     On Error GoTo 0
  48.  
  49.     ' --- Read Configuration ---
  50.    lastConfigRow = configSheet.Cells(configSheet.Rows.Count, "A").End(xlUp).Row
  51.     If lastConfigRow < 2 Then
  52.         MsgBox "The 'Config' sheet is empty. Please list fields to extract in column A, starting at A2.", vbExclamation
  53.         Exit Sub
  54.     End If
  55.     fieldsToExtract = configSheet.Range("A2:A" & lastConfigRow).Value
  56.  
  57.     ' --- Setup: Prepare Data Sheet ---
  58.    dataSheet.Cells.ClearContents
  59.     dataSheet.Range("A1").Value = "URL"
  60.     For i = 1 To UBound(fieldsToExtract, 1)
  61.         dataSheet.Cells(1, i + 1).Value = fieldsToExtract(i, 1)
  62.     Next i
  63.     dataSheet.Rows(1).Font.Bold = True
  64.  
  65.     ' --- Input Validation ---
  66.    lastRow = urlSheet.Cells(urlSheet.Rows.Count, "A").End(xlUp).Row
  67.     If lastRow < 2 Then
  68.         MsgBox "No URLs found in the 'URLs' sheet. Please paste them starting from cell A2.", vbInformation
  69.         Exit Sub
  70.     End If
  71.     Set urlRange = urlSheet.Range("A2:A" & lastRow)
  72.  
  73.     ' --- Main Processing Loop ---
  74.    Set http = CreateObject("MSXML2.XMLHTTP.6.0")
  75.     currentRow = 2
  76.  
  77.     For Each urlCell In urlRange.Cells
  78.         url = Trim(urlCell.Value)
  79.         Application.StatusBar = "Processing " & (currentRow - 1) & " of " & urlRange.Count & ": " & url
  80.         dataSheet.Cells(currentRow, 1).Value = url
  81.         If Left(url, 4) = "http" Then
  82.             On Error Resume Next
  83.             http.Open "GET", url, False
  84.             http.Send
  85.             On Error GoTo 0
  86.             If http.Status = 200 Then
  87.                 htmlContent = http.responseText
  88.                 For i = 1 To UBound(fieldsToExtract, 1)
  89.                     extractedValue = GetAllMetaContent(htmlContent, fieldsToExtract(i, 1))
  90.                     dataSheet.Cells(currentRow, i + 1).Value = extractedValue
  91.                 Next i
  92.             Else
  93.                 For i = 1 To UBound(fieldsToExtract, 1)
  94.                     dataSheet.Cells(currentRow, i + 1).Value = "Error: HTTP " & http.Status
  95.                 Next i
  96.             End If
  97.         Else
  98.             For i = 1 To UBound(fieldsToExtract, 1)
  99.                 dataSheet.Cells(currentRow, i + 1).Value = "Invalid URL"
  100.             Next i
  101.         End If
  102.         currentRow = currentRow + 1
  103.     Next urlCell
  104.    
  105.     Application.StatusBar = False
  106.  
  107.     ' ==========================================================
  108.    ' --- NEW: Final Formatting Section ---
  109.    ' ==========================================================
  110.    ' Define the full range of the data we just created
  111.    Set dataRange = dataSheet.Range("A1").CurrentRegion
  112.    
  113.     ' Delete any existing table with the same name to prevent errors
  114.    On Error Resume Next
  115.     dataSheet.ListObjects("tblJournalExtract").Delete
  116.     On Error GoTo 0
  117.    
  118.     ' 1. Convert the range into a named Excel Table
  119.    Set newTable = dataSheet.ListObjects.Add(xlSrcRange, dataRange, , xlYes)
  120.     newTable.Name = "tblJournalExtract"
  121.     newTable.TableStyle = "TableStyleMedium2" ' Optional: Apply a default style
  122.    
  123.     ' 2. Set vertical alignment and text wrapping for the whole table
  124.    With newTable.Range
  125.         .VerticalAlignment = xlTop
  126.         .WrapText = True
  127.     End With
  128.    
  129.     ' 3. Loop through each column to set widths
  130.    For j = 1 To newTable.ListColumns.Count
  131.         ' Autofit the column first to get its ideal width
  132.        newTable.ListColumns(j).Range.Columns.AutoFit
  133.        
  134.         ' If the autofitted width is more than 60, cap it at 60
  135.        If newTable.ListColumns(j).Range.Columns.ColumnWidth > 60 Then
  136.             newTable.ListColumns(j).Range.Columns.ColumnWidth = 60
  137.         End If
  138.     Next j
  139.     ' ==========================================================
  140.    ' --- End of New Formatting Section ---
  141.    ' ==========================================================
  142.  
  143.     ' --- Final Message & Cleanup ---
  144.    MsgBox "Extraction complete! " & urlRange.Count & " URLs processed and formatted into a table.", vbInformation
  145.     Set http = Nothing
  146.     Set urlSheet = Nothing
  147.     Set dataSheet = Nothing
  148.     Set configSheet = Nothing
  149.     Set urlCell = Nothing
  150.     Set urlRange = Nothing
  151.     Set dataRange = Nothing
  152.     Set newTable = Nothing
  153.  
  154. End Sub
  155.  
  156. ' ===================================================================
  157. '  Helper Function to Extract ALL Occurrences of a Meta Tag's Content
  158. ' ===================================================================
  159. Private Function GetAllMetaContent(ByVal textToSearch As String, ByVal metaName As String) As String
  160.     Dim regex As Object, allMatches As Object, singleMatch As Object
  161.     Dim results As String, pattern As String
  162.     Set regex = CreateObject("VBScript.RegExp")
  163.     pattern = "<meta\s+name=""" & metaName & """.*?content=""([^""]*)"""
  164.     With regex
  165.         .Global = True
  166.         .IgnoreCase = True
  167.         .pattern = pattern
  168.     End With
  169.     Set allMatches = regex.Execute(textToSearch)
  170.     If allMatches.Count > 0 Then
  171.         For Each singleMatch In allMatches
  172.             results = results & singleMatch.SubMatches(0) & "; "
  173.         Next singleMatch
  174.         GetAllMetaContent = Left(results, Len(results) - 2)
  175.     Else
  176.         GetAllMetaContent = "Not Found"
  177.     End If
  178.     Set regex = Nothing
  179.     Set allMatches = Nothing
  180.     Set singleMatch = Nothing
  181. End Function
  182.  
Advertisement
Add Comment
Please, Sign In to add comment