Advertisement
ShadowTzu

Html Extractor

Nov 22nd, 2014
609
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
VB.NET 7.88 KB | None | 0 0
  1. 'Coded by ShadowTzu
  2. 'Free to use
  3.  
  4. 'my 3D engine, Tzu3D: http://shadowtzu.free.fr
  5. 'Youtube: https://www.youtube.com/user/shadowtzu
  6. 'Facebook: https://www.facebook.com/Tzu3d
  7. 'Twitter: https://twitter.com/shadowtzu
  8. 'Twitch: http://www.twitch.tv/shadowtzu
  9. 'Website: http://tzu3d.weebly.com
  10.  
  11. Imports System.Net
  12.  
  ''' <summary>
  ''' Small, dependency-free HTML scraper: loads a page (from a URL or a string) and
  ''' returns the n-th element with a given tag name, exposing its inner text and its
  ''' double-quoted attributes.
  ''' </summary>
  ''' <remarks>
  ''' This is a plain text scanner, not an HTML parser. An element is closed by the first
  ''' matching closing tag, so nested elements of the same name are not balanced. Tag
  ''' names are matched case-sensitively and only double-quoted attribute values are read.
  '''
  ''' Example:
  '''   Dim ExtractData As New Extractor
  '''   ExtractData.Load("http://www.youtube.com/results?search_query=telemann")
  '''   Dim elem As Extractor.struct_Element = ExtractData.Element("ol", 1)
  '''   Dim elemUrl As Extractor.struct_Element = elem.Element("a", 1)
  '''   MsgBox("http://www.youtube.com" &amp; elemUrl.Attribut("href") &amp; " = " &amp; elemUrl.InnerText)
  ''' </remarks>
  Public Class Extractor
      Implements IDisposable

  #Region "Structure"
      ''' <summary>
      ''' One extracted element: its inner text plus its attribute name/value pairs.
      ''' A default (empty) instance is returned when the requested element does not exist.
      ''' </summary>
      Public Structure struct_Element
          Friend mtext As String
          Friend mAttributes_Name() As String
          Friend mAttributes_Value() As String

          ''' <summary>
          ''' Text found between the opening and closing tag, with line breaks removed.
          ''' Nothing when the element was not found.
          ''' </summary>
          Public ReadOnly Property InnerText As String
              Get
                  Return mtext
              End Get
          End Property

          ''' <summary>
          ''' Returns the value of the attribute called <paramref name="Name"/>, or Nothing
          ''' when the element has no such attribute.
          ''' </summary>
          ''' <param name="Name">Attribute name; compared without regard to case.</param>
          Public ReadOnly Property Attribut(Name As String) As String
              Get
                  If Name Is Nothing OrElse mAttributes_Name Is Nothing OrElse mAttributes_Value Is Nothing Then Return Nothing
                  Dim lastIndex As Integer = Math.Min(mAttributes_Name.Length, mAttributes_Value.Length) - 1
                  For i As Integer = 0 To lastIndex
                      ' Ordinal, case-insensitive on both sides: markup such as HREF="..." must
                      ' still be found when the caller asks for "href".
                      If String.Equals(mAttributes_Name(i), Name, StringComparison.OrdinalIgnoreCase) Then Return mAttributes_Value(i)
                  Next
                  Return Nothing
              End Get
          End Property

          ''' <summary>
          ''' Searches this element's inner text for the element number
          ''' <paramref name="Index"/> (zero-based) with tag name <paramref name="balise"/>.
          ''' </summary>
          Public Function Element(balise As String, Index As Integer) As struct_Element
              Return Create_element(balise, Index, mtext)
          End Function
      End Structure

      ''' <summary>Raw description of one tag occurrence located in the scanned text.</summary>
      Private Structure struct_Balise
          ''' <summary>Tag name; Nothing/empty means "no tag was found".</summary>
          Public Name As String
          ''' <summary>Raw text between the tag name and the closing "&gt;" of the opening tag.</summary>
          Public Attributes As String
          ''' <summary>Text between the opening tag and its closing tag.</summary>
          Public Value As String
          ''' <summary>Index of the "&lt;" that starts the opening tag.</summary>
          Public Start As Integer
          ''' <summary>Number of characters from Start to the end of the element.</summary>
          Public Length As Integer
      End Structure
  #End Region

      ' Characters that separate an attribute name from whatever precedes it in the tag.
      Private Shared ReadOnly AttributeSeparators() As Char = {" "c, ChrW(9), ChrW(10), ChrW(13)}

      Private mDataPage As String
      Private Web As WebClient

  #Region "Constructeur"
      ''' <summary>Creates an extractor whose web client uses no proxy and decodes pages as UTF-8.</summary>
      Public Sub New()
          Web = New WebClient
          Web.Proxy = Nothing
          Web.Encoding = System.Text.Encoding.UTF8
      End Sub
  #End Region

  #Region "Load"
      ''' <summary>
      ''' Downloads the page at <paramref name="url"/> and makes it the current document.
      ''' </summary>
      ''' <remarks>
      ''' Best effort: this method never throws. When the download fails (or the extractor
      ''' has been disposed) the current document is left empty, so later calls to
      ''' <see cref="Element"/> return empty elements instead of data from an earlier page.
      ''' </remarks>
      Public Sub Load(url As String)
          ' Forget the previous page first so a failed download cannot be mistaken for a success.
          mDataPage = Nothing
          If Web Is Nothing Then Exit Sub

          Try
              mDataPage = Web.DownloadString(url)
          Catch ex As Exception
              ' Callers rely on Load not throwing; surface the failure to the debugger instead.
              System.Diagnostics.Debug.WriteLine("Extractor.Load failed for '" & url & "': " & ex.Message)
          End Try
      End Sub

      ''' <summary>Uses <paramref name="Data"/> as the current document without any download.</summary>
      Public Sub Load_FromString(Data As String)
          mDataPage = Data
      End Sub
  #End Region

  #Region "Public"
      ''' <summary>
      ''' Returns the element number <paramref name="Index"/> (zero-based) with tag name
      ''' <paramref name="balise"/> from the current document, or an empty element when
      ''' there is no document or no such element.
      ''' </summary>
      Public Function Element(balise As String, Index As Integer) As struct_Element
          Return Create_element(balise, Index, mDataPage)
      End Function
  #End Region

  #Region "Private"

      ''' <summary>
      ''' Builds an element from the tag occurrence number <paramref name="index"/> found in
      ''' <paramref name="data"/>.
      ''' </summary>
      ''' <param name="balise">Tag name to look for, without angle brackets.</param>
      ''' <param name="index">Zero-based occurrence number.</param>
      ''' <param name="data">Text to scan; may be Nothing.</param>
      ''' <returns>The element, or an empty element when it cannot be found.</returns>
      Private Shared Function Create_element(balise As String, index As Integer, data As String) As struct_Element
          Dim myElement As struct_Element = Nothing
          If String.IsNullOrEmpty(data) OrElse String.IsNullOrEmpty(balise) OrElse index < 0 Then Return myElement

          ' Quote entities are decoded before scanning, exactly as the extracted text has always been.
          data = data.Replace("&quot;", """")

          Dim start_index As Integer = 0
          Dim myBalise As struct_Balise = Nothing
          For i As Integer = 0 To index
              myBalise = Search(balise, start_index, data)
              ' Stop as soon as the tag runs out: restarting from 0 would make an
              ' out-of-range index silently return an earlier element.
              If String.IsNullOrEmpty(myBalise.Name) Then Return myElement
              start_index = myBalise.Start + myBalise.Length
          Next i

          Parse_Attributes(myBalise.Attributes, myElement)
          myElement.mtext = myBalise.Value

          Return myElement
      End Function

      ''' <summary>
      ''' Splits the raw attribute text of an opening tag into name/value pairs and stores
      ''' them in <paramref name="target"/>. Only name="value" pairs are recognised.
      ''' </summary>
      Private Shared Sub Parse_Attributes(rawAttributes As String, ByRef target As struct_Element)
          If String.IsNullOrEmpty(rawAttributes) Then Exit Sub

          ' Splitting on the quote yields: name-part, value, name-part, value, ..., tail.
          Dim parts() As String = rawAttributes.Split(""""c)
          Dim pairCount As Integer = parts.Length \ 2
          Dim names() As String = New String(pairCount - 1) {}
          Dim values() As String = New String(pairCount - 1) {}

          For i As Integer = 0 To pairCount - 1
              Dim attrName As String = parts(2 * i).Replace("=", "").Trim()
              ' Keep only the last token so a preceding bare attribute ("disabled class=")
              ' does not end up glued to the name.
              Dim cut As Integer = attrName.LastIndexOfAny(AttributeSeparators)
              If cut >= 0 Then attrName = attrName.Substring(cut + 1)

              names(i) = attrName
              values(i) = ClearString(parts(2 * i + 1))
          Next i

          target.mAttributes_Name = names
          target.mAttributes_Value = values
      End Sub

      ''' <summary>
      ''' Finds the next occurrence of a tag at or after <paramref name="Start_Index"/> and
      ''' reads both its attributes and its content.
      ''' </summary>
      ''' <returns>The tag description; its Name is Nothing when no tag was found.</returns>
      Private Shared Function Search(Balise As String, Start_Index As Integer, data As String) As struct_Balise
          Dim myBalise As struct_Balise = Extract_Attrib(Balise, data, Start_Index)
          Extract_Value(myBalise, data)
          myBalise.Value = ClearString(myBalise.Value)
          Return myBalise
      End Function

      ''' <summary>
      ''' Locates the next opening tag named <paramref name="balise"/> and captures its raw
      ''' attribute text.
      ''' </summary>
      ''' <param name="balise">Tag name, without angle brackets.</param>
      ''' <param name="data">Text to scan.</param>
      ''' <param name="Start_Index">Index at which the scan begins.</param>
      ''' <returns>The tag description, or Nothing (empty Name) when no complete opening tag exists.</returns>
      Private Shared Function Extract_Attrib(balise As String, data As String, Start_Index As Integer) As struct_Balise
          If String.IsNullOrEmpty(balise) OrElse String.IsNullOrEmpty(data) Then Return Nothing
          If Start_Index < 0 OrElse Start_Index > data.Length Then Return Nothing

          Dim opener As String = "<" & balise
          Dim Data_start As Integer = data.IndexOf(opener, Start_Index, StringComparison.Ordinal)

          ' Skip longer tag names that merely begin with the requested one ("<a" vs "<abbr", "<h" vs "<h1").
          Do While Data_start <> -1
              Dim afterName As Integer = Data_start + opener.Length
              If afterName >= data.Length Then Return Nothing ' text ends in the middle of the tag
              If Not Is_TagNameChar(data(afterName)) Then Exit Do
              Data_start = data.IndexOf(opener, afterName, StringComparison.Ordinal)
          Loop
          If Data_start = -1 Then Return Nothing

          Dim attrib_start As Integer = Data_start + opener.Length
          Dim data_end As Integer = data.IndexOf(">"c, attrib_start)
          If data_end = -1 Then Return Nothing

          Dim myBalise As struct_Balise = Nothing
          myBalise.Attributes = data.Substring(attrib_start, data_end - attrib_start)
          myBalise.Start = Data_start
          myBalise.Name = balise

          Return myBalise
      End Function

      ''' <summary>True when <paramref name="c"/> can continue a tag name.</summary>
      Private Shared Function Is_TagNameChar(c As Char) As Boolean
          Return Char.IsLetterOrDigit(c) OrElse c = "-"c OrElse c = "_"c OrElse c = ":"c
      End Function

      ''' <summary>
      ''' Reads the content of the tag described by <paramref name="balise"/> and records
      ''' how many characters the whole element spans.
      ''' </summary>
      ''' <param name="balise">Tag located by Extract_Attrib; left untouched when its Name is empty.</param>
      ''' <param name="data">Text the tag was located in.</param>
      Private Shared Sub Extract_Value(ByRef balise As struct_Balise, data As String)
          If String.IsNullOrEmpty(balise.Name) Then Exit Sub

          Dim attrib_length As Integer = If(balise.Attributes Is Nothing, 0, balise.Attributes.Length)
          ' "<" + name + attributes + ">" : the first character after the opening tag.
          Dim sub_start As Integer = balise.Start + 1 + balise.Name.Length + attrib_length + 1

          ' Look for the real closing tag; searching only for "name>" also matched opening tags.
          Dim closer As String = "</" & balise.Name & ">"
          Dim ends As Integer = data.IndexOf(closer, sub_start, StringComparison.Ordinal)

          If ends = -1 Then
              ' Void or unclosed element (<img>, <br>, ...): no content, resume right after the opening tag.
              balise.Value = ""
              balise.Length = sub_start - balise.Start
              Exit Sub
          End If

          balise.Value = data.Substring(sub_start, ends - sub_start)
          ' Span the whole element so the next scan starts after the closing tag.
          balise.Length = ends + closer.Length - balise.Start
      End Sub

      ''' <summary>
      ''' Removes every carriage return and line feed from <paramref name="text"/> and trims
      ''' the surrounding spaces.
      ''' </summary>
      ''' <returns>The cleaned text; an empty string when the input is Nothing or empty.</returns>
      Private Shared Function ClearString(text As String) As String
          If String.IsNullOrEmpty(text) Then Return ""
          Return Trim(text.Replace(vbCr, "").Replace(vbLf, ""))
      End Function

  #End Region

  #Region "Destructeur"
      Private disposedValue As Boolean

      ''' <summary>Releases the web client (when <paramref name="disposing"/>) and drops the loaded page.</summary>
      Protected Overridable Sub Dispose(disposing As Boolean)
          If Not Me.disposedValue Then
              If disposing AndAlso Me.Web IsNot Nothing Then
                  Me.Web.Dispose()
                  Me.Web = Nothing
              End If
              mDataPage = Nothing
          End If
          Me.disposedValue = True
      End Sub

      ''' <summary>Releases the resources held by this extractor.</summary>
      Public Sub Dispose() Implements IDisposable.Dispose
          Dispose(True)
          GC.SuppressFinalize(Me)
      End Sub
  #End Region

  End Class
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement