Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
762 views
in Technique[技术] by (71.8m points)

excel - Using post data and xlmlhttp

I am still trying to learn more about scraping and I could devise a code that enables me to get the desired results.

Here's the code

Sub Test()
    Dim e          As Variant
    Dim ie          As Object
    Dim ulElem      As Object
    Dim liElem      As Object
    Dim anchElem    As Object
    Dim dt          As Date
    Dim lDay        As Integer
    Dim lMnth      As Integer
    Dim lYear      As Integer
    Dim r          As Long

    Set ie = CreateObject("InternetExplorer.Application")
    dt = Date - 2
    lDay = Day(dt)
    lMnth = Month(dt)
    lYear = Year(dt)

    With ie
        .Visible = False
        .Navigate "http://www.handelsregisterbekanntmachungen.de/?aktion=suche#Ergebnis"

        Do: DoEvents: Loop Until .readyState = 4

        For Each e In ie.document.getElementsByTagName("select")
            If Len(e.innerText) = 56 Then
                e.selectedIndex = lDay
            ElseIf Len(e.innerText) = 18 Then
                e.selectedIndex = lMnth
            ElseIf Left(e.innerText, 8) = "----2000" Then
                e.selectedIndex = lYear - 1999
            ElseIf InStr(e.innerText, "Alle Bekanntmachungen") > 0 Then
                e.selectedIndex = 1
            End If
        Next e

        For Each e In ie.document.getElementsByTagName("input")
            If e.Value = "Suche starten" Then e.Click: Exit For
        Next e
        Do: DoEvents: Loop Until .readyState = 4
        Application.Wait Now() + TimeValue("00:00:05")

        If InStr(ie.document.body.innerHTML, "Es wurden 0 Treffer gefunden.") > 0 Then
            MsgBox "No Results Found", vbExclamation: Exit Sub
        Else
            For Each ulElem In ie.document.getElementsByTagName("b")
                For Each liElem In ulElem.getElementsByTagName("li")
                    Set anchElem = liElem.getElementsByTagName("a")
                    If anchElem.Length > 0 Then
                        r = r + 1
                        Cells(r, 1) = Mid(anchElem.Item(0).innerText, 11)
                    End If
                Next liElem
            Next ulElem
        End If
    End With
End Sub

But as a matter of trying to learn more about XMLHTTP requests I am seeking for a way to get the same results but without using IE. so I think using XMLHTTP will be more efficient specially I could see post data after setting up the desired choices for the search process.

See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Reply

0 votes
by (71.8m points)

Take a look at the below example:

Option Explicit

Sub Test()

    Dim sState As String
    Dim sCourt As String
    Dim dtFrom As Date
    Dim dtTill As Date
    Dim sSubject As String
    Dim sOrder As String
    Dim oStates As Object
    Dim oCourts As Object
    Dim oSubjects As Object
    Dim oOrders As Object
    Dim sStateCode As String
    Dim sCourtId As String
    Dim sSubjectVal As String
    Dim sOrderVal As String
    Dim aData

    ' Set query data
    sState = ""
    sCourt = ""
    dtFrom = DateSerial(2018, 2, 11)
    dtTill = DateSerial(2018, 2, 11)
    sSubject = ""
    sOrder = "Aktenzeichen"

    ' Retrieve options
    GetOptions oStates, oCourts, oSubjects, oOrders

    ' Validate query parameters
    If Not oStates.Exists(sState) Then MsgBox "State valid values:" & vbCrLf & vbCrLf & """" & Join(oStates.Keys(), """" & vbCrLf & """") & """": Exit Sub
    If Not oCourts(oStates(sState)).Exists(sCourt) Then MsgBox "Court valid values:" & vbCrLf & vbCrLf & """" & Join(oCourts(oStates(sState)).Keys(), """" & vbCrLf & """") & """": Exit Sub
    If Not oSubjects.Exists(sSubject) Then MsgBox "Subject valid values:" & vbCrLf & vbCrLf & """" & Join(oSubjects.Keys(), """" & vbCrLf & """") & """": Exit Sub
    If Not oOrders.Exists(sOrder) Then MsgBox "Order valid values:" & vbCrLf & vbCrLf & """" & Join(oOrders.Keys(), """" & vbCrLf & """") & """": Exit Sub

    ' Request data
    sStateCode = oStates(sState)
    sCourtId = oCourts(sStateCode)(sCourt)
    sSubjectVal = oSubjects(sSubject)
    sOrderVal = oOrders(sOrder)
    GetData sStateCode, sCourt, sCourtId, dtFrom, dtTill, sSubjectVal, sOrderVal, aData

    ' Rebuild nested arrays to 2d array for output
    aData = Denestify(aData)
    ' Output
    With ThisWorkbook.Sheets(1)
        .Cells.Delete
        Output2DArray .Cells(1, 1), aData
        .Columns.AutoFit
    End With

    MsgBox "Completed"

End Sub

Sub GetOptions(oStates As Object, oCourts As Object, oSubjects As Object, oOrders As Object)

    Dim sContent As String
    Dim aTmp0
    Dim aTmp1
    Dim vItem
    Dim oTmp
    Dim i As Long

    ' Retrieve request options from search page
    XmlHttpRequest "GET", "https://www.handelsregisterbekanntmachungen.de/?aktion=suche", "", "", "", sContent

    ' Get each state and code
    ExtractOptions sContent, "land", oStates

    ' Get courts with courts ids for each federal state
    Set oCourts = CreateObject("Scripting.Dictionary")
    For Each vItem In oStates.Items()
        ' Put courts and ids into temp dictionary
        Set oTmp = CreateObject("Scripting.Dictionary")
        If vItem <> "" Then
            ' Extract the whole JS array statement with courts names
            ParseResponse "BundeslandArray['" & vItem & "']=new Array(('[^']*'(?:,'[^']*')*));", sContent, aTmp0, False
            ' Extract each court name into temp array
            ParseResponse "'([^']*)'", (aTmp0(0)), aTmp0, False
            ' Extract the whole JS array statement with courts ids
            ParseResponse "BundeslandArrayId['" & vItem & "']=new Array(('[^']*'(?:,'[^']*')*));", sContent, aTmp1, False
            ' Extract each court id into temp array
            ParseResponse "'([^']*)'", (aTmp1(0)), aTmp1, False
            For i = 0 To UBound(aTmp0)
                oTmp(DecodeHTMLEntities((aTmp0(i)))) = DecodeHTMLEntities((aTmp1(i)))
            Next
        End If
        ' Add dummy item
        oTmp("") = ""
        ' Put courts-ids for the state code into dictionary
        Set oCourts(vItem) = oTmp
    Next
    ' Add dummy item
    oStates("") = ""

    ' Get subjects
    ExtractOptions sContent, "gegenstand", oSubjects
    ' Add dummy item
    oSubjects("") = "0"

    ' Get sort order types
    ExtractOptions sContent, "order", oOrders

End Sub

Sub GetData(sStateCode As String, sCourt As String, sCourtId As String, dtFrom As Date, dtTill As Date, sSubjectVal As String, sOrderVal As String, aData)

    Dim i As Long
    Dim oQuery As Object
    Dim sQuery As String
    Dim sContent As String

    ' Set query parameters
    Set oQuery = CreateObject("Scripting.Dictionary")
    With oQuery
        .Add "suchart", "uneingeschr"
        .Add "button", "Start search"
        .Add "land", sStateCode
        .Add "gericht", sCourtId
        .Add "gericht_name", sCourt
        .Add "seite", ""
        .Add "l", ""
        .Add "r", ""
        .Add "all", "false"
        .Add "vt", Day(dtFrom)
        .Add "vm", Month(dtFrom)
        .Add "vj", Year(dtFrom)
        .Add "bt", Day(dtTill)
        .Add "bm", Month(dtTill)
        .Add "bj", Year(dtTill)
        .Add "fname", ""
        .Add "fsitz", ""
        .Add "rubrik", ""
        .Add "az", ""
        .Add "gegenstand", sSubjectVal
        .Add "anzv", "alle"
        .Add "order", sOrderVal
    End With
    sQuery = EncodeQueryParams(oQuery)

    ' Retrieve search results
    XmlHttpRequest "POST", _
        "https://www.handelsregisterbekanntmachungen.de/de/index.php?aktion=suche", _
        Array( _
            Array("Content-Type", "application/x-www-form-urlencoded"), _
            Array("Content-Length", Len(sQuery) _
            ) _
        ), _
        sQuery, _
        "", _
        sContent

    ' Parse response
    sContent = Replace(sContent, "<br>", vbCrLf)
    ParseResponse "<li[^>]*><a[^>]*?href=""javascript:NeuFenster('([^']*)')""[^>]*>([^<]*)<ul[^>]*>([sS]*?)</ul>", sContent, aData, False
    For i = 0 To UBound(aData, 1)
        aData(i)(0) = "http://www.handelsregisterbekanntmachungen.de/en/skripte/hrb.php?" & aData(i)(0)
    Next

End Sub

Sub ExtractOptions(sContent As String, sName As String, oOptions As Object)

    Dim aTmp0
    Dim vItem

    ' Extract the whole <select> for parameter
    ParseResponse "<select[^>]* name=" & sName & "[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
    ' Extract each parameter <option>
    ParseResponse "<option[^>]*value=(""[^""]*""|[^s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
    ' Put each parameter and value into dictionary
    Set oOptions = CreateObject("Scripting.Dictionary")
    For Each vItem In aTmp0
        oOptions(DecodeHTMLEntities((vItem(1)))) = DecodeHTMLEntities(Replace(vItem(0), """", ""))
    Next

End Sub

Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)

    Dim aHeader

    ' With CreateObject("MSXML2.ServerXMLHTTP")
        ' .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
    With CreateObject("MSXML2.XMLHTTP")
        .Open sMethod, sUrl, False
        If IsArray(aSetHeaders) Then
            For Each aHeader In aSetHeaders
                .SetRequestHeader aHeader(0), aHeader(1)
            Next
        End If
        .Send (sFormData)
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With

End Sub

Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)

    Dim oMatch
    Dim aTmp0()
    Dim sSubMatch

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = bGlobal
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If oMatch.SubMatches.Count = 1 Then
                PushItem aData, oMatch.SubMatches(0)
            Else
                aTmp0 = Array()
                For Each sSubMatch In oMatch.SubMatches
                    PushItem aTmp0, sSubMatch
                Next
                PushItem aData, aTmp0
            End If
        Next
    End With

End Sub

Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    ReDim Preserve aData(UBound(aData) + 1)
    aData(UBound(aData)) = vItem

End Sub

Function DecodeHTMLEntities(sText As String) As String

    Static oHtmlfile As Object
    Static oDiv As Object

    If oHtmlfile Is Nothing Then
        Set oHtmlfile = CreateObject("htmlfile")
        oHtmlfile.Open
        Set oDiv = oHtmlfile.createElement("div")
    End If
    oDiv.innerHTML = sText
    DecodeHTMLEntities = oDiv.innerText

End Function

Function EncodeQueryParams(oParams As Object) As String

    Dim aParams
    Dim i As Long

    aParams = oParams.Keys()
    For i = 0 To UBound(aParams)
        aParams(i) = EncodeUriComponent((aParams(i))) & "=" & EncodeUriComponent((oParams(aParams(i))))
    Next
    EncodeQueryParams = Join(aParams, "&")

End Function

Function EncodeUriComponent(strText As String) As String

    Static objHtmlfile As Object

    If objHtmlfile Is Nothing Then
        Set objHtmlfile = CreateObject("htmlfile")
        objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript"
    End If
    EncodeUriComponent = objHtmlfile.parentWindow.encode(strText)

End Function

Function Denestify(aRows)

    Dim aData()
    Dim aItems()
    Dim i As Long
    Dim j As Long

    If UBound(aRows) = -1 Then Exit Function
    ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
    For j = 0 To UBound(aRows)
        If IsArray(aRows(j)) Then
            aItems = aRows(j)
            For i = 0 To UBound(aItems)
                If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
                aData(j + 1, i + 1) = aItems(i)
            Next
        Else
            aData(j + 1, 1) = aRows(j)
        End If
    Next
    Denestify = aData

End Function

Sub Output2DArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize( _
                UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                UBound(aCells, 2) - LBound(aCells, 2) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
OGeek|极客中国-欢迎来到极客的世界,一个免费开放的程序员编程交流平台!开放,进步,分享!让技术改变生活,让极客改变未来! Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...