Vielleicht braucht mal jemand so etwas.
Ich wollte mir das Ziehen von Infos aus Webseiten etwas vereinfachen und habe mir eine Funktion geschrieben, mit der man Elemente aus HTML-Strings mit Hilfe von CSS-Selektoren filtern kann.
Aktuell habe ich nur die ID-, Class- und Attribut-Selektoren implementiert, aber das sind ja auch erst einmal die wichtigsten.
Ansonsten ist es sicherlich noch ein bisschen buggy.
Hier mal ein Beispiel, wie man damit den aktuellsten Thread hier auf der Seite ermitteln könnte:
Spoiler anzeigen
AutoIt
#include-once
#include <array.au3>
; Example: show the title of the newest thread on the forum start page.
; InetRead() returns binary data; BinaryToString(..., 4) decodes it as UTF-8.
Local $sHtml = BinaryToString(InetRead("https://autoit.de/"), 4) ; get the raw source
Local $a_Elements = _CssSelectFromString($sHtml, "div#portalBox2 tr.wbbThread:first a:first")
; A selector that matches nothing yields a result without any rows, so make sure
; row 0 exists before indexing it (column 2 holds the element content).
If Not @error And UBound($a_Elements) > 0 Then MsgBox(0, "", "Aktuellster Thread im Forum: """ & $a_Elements[0][2] & '"')
; #FUNCTION# ======================================================================================
; Name ..........: _CssSelectFromString()
; Description ...: Extracts elements from an HTML string with CSS-style selectors.
; Syntax ........: _CssSelectFromString($s_String, $s_Selector)
; Parameters ....: $s_String    - the HTML string
;                  $s_Selector  - the CSS-style selector; whitespace separates descendant steps
;                  $b_First     - don't touch! - internal use only (True only on the outermost call)
;                  $b_SubCheck  - don't touch! - internal use only (True while evaluating :has()/:nothas())
; Return values .: Success: 2D array with one row per matched element
;                           Array[n][0] = element type (tag name)
;                           Array[n][1] = element attributes string
;                           Array[n][2] = element content string
;                  Failure: ""
;                           @error = 1: empty selector or no element matched
; Remarks .......: The following CSS-Selector-Features are implemented:
;                  .class                 - class of an element
;                  #ID                    - ID of an element
;                  [attribute]            - attribute exists
;                  [attribute="value"]    - elements with attribute=value
;                  [attribute^="value"]   - elements where attribute starts with value (|= is treated the same)
;                  [attribute~="value"]   - elements where attribute contains value (*= is treated the same)
;                  [attribute$="value"]   - elements where attribute ends with value
;                  :first                 - only the first element gets returned
;                  :empty                 - elements whose content is empty or whitespace only
;                  :nth-of-type(<number>) - match only the nth match of the element
;                  :has(<Selector>)       - elements which contain at least one element that matches the specified selector
;                  :nothas(<Selector>)    - elements which do not contain an element that matches the specified selector
;                  Attribute values are matched literally (regex metacharacters have no special meaning).
; Known bugs ....: - If balancing is not correct in the string - the result is also not correct
;                  - If selector is too general - some results may get doubled - to solve this: be as exact as possible
; Author ........: AspirinJunkie
; Version .......: 0.8.1
; =================================================================================================
Func _CssSelectFromString($s_String, $s_Selector, Const $b_First = True, Const $b_SubCheck = False)
    Local Static $o_Result ; collects the matches of the outermost call across all recursion levels
    Local $a_RegEx, $s_Attrs, $s_Content, $s_Name, $s_Quoted
    Local $d_RetCounter = 0

    If $b_First Then
        $s_String = StringRegExpReplace($s_String, "(?s)(?x)(?:<!--.+?)(?R)?(?:-->)", "") ; strip comments
        $s_String = StringRegExpReplace($s_String, "(?s)(?x)(?:<script.+?)(?R)?(?:</script)", "") ; strip script elements (their content could confuse the parser)
        $s_String = StringRegExpReplace($s_String, "(?s)^.+<body\b(.+?)</body>.+$", "$1") ; only work on the body content
        $o_Result = ObjCreate('Scripting.Dictionary')
    EndIf

    ; Split the selector into the current step and the remaining (descendant) steps.
    $s_Selector = StringStripWS($s_Selector, 7)
    Local $a_Step = StringRegExp($s_Selector, '^((?:"\s*[^"]+\s*[^"]+"|\(((?:.|(?R))*)\)|\S)+)', 1)
    Local $d_Next = @extended ; must be read before any other function call resets it
    If Not IsArray($a_Step) Then Return SetError(1, 0, ($b_SubCheck ? False : "")) ; empty selector
    Local $s_CurrentSelector = $a_Step[0]
    Local $s_SelectorRest = StringStripWS(StringTrimLeft($s_Selector, $d_Next), 1)

    ; Get the element type:
    $a_RegEx = StringRegExp($s_CurrentSelector, "^(\w*)", 1)
    If @error Then Return SetError(1, @error, "")
    Local $s_Type = $a_RegEx[0]

    ; Fetch all elements of that type; they are checked against the other qualifiers below.
    Local $a_Types = _HtmlElements($s_String, $s_Type, True)

    Local $a_HasProperty = StringRegExp($s_CurrentSelector, ":has\(\s*((?:.|(?R))*)\)", 3)
    If @error Then Local $a_HasProperty[0]
    Local $a_nothasProperty = StringRegExp($s_CurrentSelector, ":nothas\(\s*((?:.|(?R))*)\)", 3)
    If @error Then Local $a_nothasProperty[0]
    ; delete :has and :nothas selectors because the inner selector must not be processed by the following checks
    $s_CurrentSelector = StringRegExpReplace($s_CurrentSelector, ":(has|nothas)\(\s*((?:.|(?R))*)\)", "")

    ; [attr op "value"] selectors (flag 4: one sub-array per match, index 1 = name, 2 = operator, 3 = value)
    Local $a_AttribOpt = StringRegExp($s_CurrentSelector, "\[([\w\-]+)\s*([~\*|\^\$]?=)\s*['""](.+?)['""]\s*\]", 4)
    If @error Then Local $a_AttribOpt[0]
    ; [attr] existence selectors - loop invariant, so evaluated once here
    Local $a_AttribExist = StringRegExp($s_CurrentSelector, "\[([\w\-]+)\s*\]", 3)
    If @error Then Local $a_AttribExist[0]

    ; Classes and IDs are taken from the selector WITHOUT its [...] parts, otherwise a value
    ; like [href="page.html#top"] would be misread as class "html" and ID "top".
    Local $s_NoAttr = StringRegExpReplace($s_CurrentSelector, "\[[^\]]*\]", "")
    Local $a_ClassesOpt = StringRegExp($s_NoAttr, "\.([\w\-]+)", 3)
    If @error Then Local $a_ClassesOpt[0]
    Local $a_IDOpt = StringRegExp($s_NoAttr, "#([\w\-]+)", 3)
    If @error Then Local $a_IDOpt[0]

    Local $b_EmptyProperty = StringRegExp($s_CurrentSelector, ":empty\b", 0)
    $a_RegEx = StringRegExp($s_CurrentSelector, ":nth-of-type\((\d+)\)", 1)
    Local $d_NthOfType = @error ? 0 : Int($a_RegEx[0]) ; numeric, so the comparisons below are numeric too
    Local $b_FirstOnly = (StringInStr($s_CurrentSelector, ":first", 2) > 0)

    For $j = 0 To UBound($a_Types) - 1
        $s_Attrs = $a_Types[$j][1]

        ; Check for ID:
        For $ID In $a_IDOpt
            If Not StringRegExp($s_Attrs, "(?<![\w\-])id=['""]" & $ID & "['""]") Then ContinueLoop 2
        Next

        ; Check for class (the lookarounds keep "foo" from matching inside "foo-bar"):
        For $Class In $a_ClassesOpt
            If Not StringRegExp($s_Attrs, "\bclass=['""][^'"">]*?(?<![\w\-])" & $Class & "(?![\w\-])[^'"">]*?['""]") Then ContinueLoop 2
        Next

        ; Check for attributes - existence (also matches a valueless attribute at the very end):
        For $Attr In $a_AttribExist
            If Not StringRegExp($s_Attrs, "(?<![\w\-])" & $Attr & "(?![\w\-])(?:[=\s/]|$)") Then ContinueLoop 2
        Next

        ; Check for attributes - value comparison; the value is quoted so it is matched literally:
        For $AttrVal In $a_AttribOpt
            $s_Name = "(?<![\w\-])" & $AttrVal[1] & "=['""]"
            $s_Quoted = __CssSelect_RegExQuote($AttrVal[3])
            Switch $AttrVal[2]
                Case "=" ; full match
                    If Not StringRegExp($s_Attrs, $s_Name & $s_Quoted & "['""]") Then ContinueLoop 2
                Case "~=", "*=" ; partial match
                    If Not StringRegExp($s_Attrs, $s_Name & "[^'""]*" & $s_Quoted & "[^'""]*['""]") Then ContinueLoop 2
                Case "|=", "^=" ; start match
                    If Not StringRegExp($s_Attrs, $s_Name & $s_Quoted & "[^'""]*['""]") Then ContinueLoop 2
                Case "$=" ; end match
                    If Not StringRegExp($s_Attrs, $s_Name & "[^'""]*" & $s_Quoted & "['""]") Then ContinueLoop 2
            EndSwitch
        Next

        ; Inner content of the candidate (columns 4/5 = content start / content end position)
        $s_Content = StringMid($s_String, $a_Types[$j][4], $a_Types[$j][5] - $a_Types[$j][4])

        ; :has(<Selector>)
        For $HasSelector In $a_HasProperty
            If Not _CssSelectFromString($s_Content, $HasSelector, False, True) Then ContinueLoop 2
        Next

        ; :nothas(<Selector>)
        For $NotSelector In $a_nothasProperty
            If _CssSelectFromString($s_Content, $NotSelector, False, True) Then ContinueLoop 2
        Next

        ; :empty
        If $b_EmptyProperty Then
            If Not StringIsSpace($s_Content) Then ContinueLoop
        EndIf

        ; :nth-of-type() - counts only candidates that passed all checks above
        If $d_NthOfType Then
            $d_RetCounter += 1
            If $d_RetCounter <> $d_NthOfType Then ContinueLoop
        EndIf

        If $s_SelectorRest = "" Then
            ; last selector step: this candidate is a final match
            If $b_SubCheck Then Return True
            Local $a_T[3] = [$a_Types[$j][0], $a_Types[$j][1], $s_Content]
            $o_Result($o_Result.Count) = $a_T
        ElseIf $b_SubCheck Then
            ; Only report success here; a failing candidate must not end the search,
            ; because a later candidate may still contain a match for the remaining steps.
            If _CssSelectFromString($s_Content, $s_SelectorRest, False, True) Then Return True
        Else
            ; descend: apply the remaining steps to the content of this candidate
            _CssSelectFromString($s_Content, $s_SelectorRest, False)
        EndIf

        ; check for ":first" selector
        If $b_FirstOnly Then ExitLoop
        If $d_NthOfType And $d_RetCounter = $d_NthOfType Then ExitLoop ; :nth-of-type() - following elements can't match anymore
    Next

    If $b_SubCheck Then Return False ; recursion inside a :has or :nothas selector found nothing

    If $b_First Then
        Local $n = $o_Result.Count
        If $n = 0 Then
            $o_Result = 0
            Return SetError(1, 0, "") ; documented failure contract: nothing matched
        EndIf
        Local $a_Return[$n][3]
        Local $v = 0
        For $i In $o_Result.Items()
            $a_Return[$v][0] = StringStripWS($i[0], 3)
            $a_Return[$v][1] = StringStripWS($i[1], 3)
            $a_Return[$v][2] = StringStripWS($i[2], 3)
            $v += 1
        Next
        $o_Result = 0 ; release the dictionary held by the static variable
        Return $a_Return
    EndIf
EndFunc   ;==>_CssSelectFromString

; Internal helper for _CssSelectFromString(): wraps $s_Text in \Q...\E so that the regex engine
; treats it as literal text. A literal "\E" inside the text is emitted outside of the quoted
; section, otherwise it would end the quoting early.
Func __CssSelect_RegExQuote($s_Text)
    Return "\Q" & StringReplace($s_Text, "\E", "\E\\E\Q", 0, 1) & "\E"
EndFunc   ;==>__CssSelect_RegExQuote
; #FUNCTION# ======================================================================================
; Name ..........: _HtmlElements()
; Description ...: Extracts all elements of type $s_Type from an HTML string.
; Syntax ........: _HtmlElements(ByRef $s_Html, Const $s_Type = "", Const $bN = False)
; Parameters ....: $s_Html - the HTML string (ByRef only to avoid a copy; it is not modified)
;                  $s_Type - the HTML element type to extract ("" = every element type)
;                  $bN     - False: returns 4-column array with element content as a string
;                          - True:  returns 6-column array with substring positions of
;                                   the element and its content inside of $s_Html
; Return values .: Success: 2D array sorted by element start position
;                      $bN = False: [n][0] = element type (tag name)
;                                   [n][1] = element attributes string
;                                   [n][2] = element content string (unset for void elements)
;                                   [n][3] = start position of the element
;                      $bN = True:  [n][0] = element type (tag name)
;                                   [n][1] = element attributes string
;                                   [n][2] = start position of the element
;                                   [n][3] = position right behind the element
;                                   [n][4] = start position of the content (unset for void elements)
;                                   [n][5] = position of the closing tag = end of content (unset for void elements)
;                  Failure: array without rows
;                           @error = 1: no balanced elements found
; Known bugs ....: If balancing is not correct in the string - the result is also not correct
; Author ........: AspirinJunkie
; =================================================================================================
Func _HtmlElements(ByRef $s_Html, Const $s_Type = "", Const $bN = False)
    Local Const $D_USED = 1000000000 ; marks a start tag that has already been paired with an end tag
    Local Const $i_Cols = ($bN ? 6 : 4)
    Local $d_Start, $d_End, $d_T, $d_Pos = 1, $a_Element, $dOff, $a_T
    Local $s_Ende = "</" & $s_Type
    Local $nB = StringLen("<" & $s_Type)
    Local $oVoids = ObjCreate("Scripting.Dictionary") ; replace later with AutoIt-Maps
    Local $s_Ends = "", $s_Begins = ""
    Local Const $s_REstart = ($s_Type = "") ? "(<\w+\b)" : "(<" & $s_Type & "\b)"
    Local Const $s_REend = ($s_Type = "") ? "(</\w+\b)" : "(</" & $s_Type & "\b)"

    ; Collect the positions of all opening tags:
    Do
        $a_T = StringRegExp($s_Html, $s_REstart, 1, $d_Pos)
        $dOff = @extended
        If $dOff = 0 Then ExitLoop
        $d_T = $dOff - StringLen($a_T[0]) ; position of the "<"

        ; void element form (e.g: <link href="fancy.css" type="text/css" /> )
        ; The \b behind the name list keeps e.g. <colgroup> from being read as void <col>.
        $a_T = StringRegExp($s_Html, "\G<(area|base|br|col|command|embed|hr|img|input|keygen|link|meta|param|source|track|wbr)\b\s*([^<]*?)/?>", 1, $d_T)
        $dOff = @extended
        If $dOff Then
            Local $aTemp[4] = [$a_T[0], $a_T[1], $d_T, $dOff]
            $d_Pos = $dOff
            $oVoids($d_T) = $aTemp
        Else ; normal elements
            $s_Begins &= $d_T & "|"
            $d_Pos = $d_T + $nB
        EndIf
    Until 0

    ; Collect the positions of all closing tags:
    $d_Pos = 1
    Do
        $a_T = StringRegExp($s_Html, $s_REend, 1, $d_Pos)
        $dOff = @extended
        If $dOff = 0 Then ExitLoop
        $s_Ends &= $dOff - StringLen($a_T[0]) & "|"
        $d_Pos = $dOff
    Until 0

    Local $a_Starts, $a_Ends, $dMax = 0
    If $s_Begins <> "" And $s_Ends <> "" Then
        $a_Starts = StringSplit(StringTrimRight($s_Begins, 1), "|", 2)
        $a_Ends = StringSplit(StringTrimRight($s_Ends, 1), "|", 2)
        For $i = 0 To UBound($a_Starts) - 1
            $a_Starts[$i] = Int($a_Starts[$i])
        Next
        For $i = 0 To UBound($a_Ends) - 1
            $a_Ends[$i] = Int($a_Ends[$i])
        Next
        ; at most min(starts, ends) pairs can be formed
        $dMax = UBound($a_Ends) > UBound($a_Starts) ? UBound($a_Starts) : UBound($a_Ends)
    EndIf

    Local $nVoids = $oVoids.Count
    If $dMax + $nVoids = 0 Then
        Local $a_Empty[0][$i_Cols]
        Return SetError(1, 0, $a_Empty)
    EndIf

    Local $a_Return[$dMax + $nVoids][$i_Cols]
    ; Rows are filled through their own counter. Indexing them by the end-tag index would
    ; write past the array (or leave blank rows) as soon as a closing tag has no partner.
    Local $iRow = 0

    ; Pair every closing tag with the nearest preceding opening tag that is still unused.
    For $i = 0 To $dMax > 0 ? UBound($a_Ends) - 1 : -1
        $d_End = $a_Ends[$i]
        For $j = UBound($a_Starts) - 1 To 0 Step -1
            $d_Start = $a_Starts[$j]
            If $d_Start = $D_USED Then ContinueLoop
            If $d_Start >= $d_End Then ContinueLoop
            If $bN Then
                $a_Element = StringRegExp($s_Html, "(?s)\G<(\w+)\b\s?(.*?)>", 1, $d_Start)
                $dOff = @extended ; position right behind the opening tag = content start
                If UBound($a_Element) < 2 Then ContinueLoop
                $a_Return[$iRow][0] = $a_Element[0]
                $a_Return[$iRow][1] = $a_Element[1]
                $a_Return[$iRow][2] = Int($d_Start)
                $a_Return[$iRow][3] = Int($d_End) + 3 + StringLen($a_Element[0]) ; skip "</" + name + ">"
                $a_Return[$iRow][4] = $dOff
                $a_Return[$iRow][5] = Int($d_End)
            Else
                $a_Element = StringRegExp(StringMid($s_Html, $d_Start, $d_End - $d_Start + StringLen($s_Ende)), "(?s)^<(\w+)\b\s?(.*?)>(.*)</" & $s_Type, 3)
                If UBound($a_Element) < 2 Then ContinueLoop
                $a_Return[$iRow][0] = $a_Element[0]
                $a_Return[$iRow][1] = $a_Element[1]
                $a_Return[$iRow][2] = $a_Element[2]
                $a_Return[$iRow][3] = Int($d_Start)
            EndIf
            $a_Starts[$j] = $D_USED
            $iRow += 1
            ExitLoop ; this closing tag is done - go on with the next one
        Next
    Next

    ; add the void elements to the result array
    For $a_Void In $oVoids.Items
        $a_Return[$iRow][0] = $a_Void[0]
        $a_Return[$iRow][1] = $a_Void[1]
        If $bN Then
            $a_Return[$iRow][2] = $a_Void[2]
            $a_Return[$iRow][3] = $a_Void[3]
        Else
            $a_Return[$iRow][3] = $a_Void[2]
        EndIf
        $iRow += 1
    Next

    If $iRow = 0 Then
        Local $a_Empty[0][$i_Cols]
        Return SetError(1, 0, $a_Empty)
    EndIf
    ; drop the rows reserved for pairs that could not be formed
    If $iRow < UBound($a_Return) Then ReDim $a_Return[$iRow][$i_Cols]

    ; order by start position so the elements appear in document order
    _ArraySort($a_Return, 0, 0, 0, ($bN ? 2 : 3))
    Return $a_Return
EndFunc   ;==>_HtmlElements
Alles anzeigen