Skip to content Skip to sidebar Skip to footer

Excel VBA Web Scraping Returning Wrong Text In MSXML2.XMLHTTP Method

I am trying to extract the movie description from this URL: 'https://ssl.ofdb.de/plot/138627,271359,I-Am-Legend'. When I use the CreateObject('InternetExplorer.Application') method it g…

Solution 1:

You need to decode the returned byte string (responseBody) as UTF-8 yourself, rather than relying on the already-converted Unicode text. You can use the helper functions shown below, which I have taken from here. This is the 64-bit version; I will leave the 32-bit declaration at the bottom. You can also use a more targeted CSS selector to obtain your node; this will be quicker and avoids additional string-cleaning function calls.

Option Explicit


''' Maps a character string to a UTF-16 (wide character) string.
''' Win32 API (kernel32). Returns the number of wide characters written, or,
''' when cchWideChar is 0, the number of wide characters required.
''' Returns 0 on failure.
'''
''' Conditional compilation keeps the module compilable on every host:
'''   * VBA7 (Office 2010 and later, 32- or 64-bit): PtrSafe + LongPtr pointers.
'''   * Earlier VBA versions: classic declaration with Long pointers.
#If VBA7 Then
Private Declare PtrSafe Function MultiByteToWideChar Lib "kernel32" ( _
    ByVal CodePage As Long, _
    ByVal dwFlags As Long, _
    ByVal lpMultiByteStr As LongPtr, _
    ByVal cchMultiByte As Long, _
    ByVal lpWideCharStr As LongPtr, _
    ByVal cchWideChar As Long _
    ) As Long
#Else
Private Declare Function MultiByteToWideChar Lib "kernel32" ( _
    ByVal CodePage As Long, _
    ByVal dwFlags As Long, _
    ByVal lpMultiByteStr As Long, _
    ByVal cchMultiByte As Long, _
    ByVal lpWideCharStr As Long, _
    ByVal cchWideChar As Long _
    ) As Long
#End If
' CodePage constant for UTF-8 (explicitly typed to match the CodePage parameter)
Private Const CP_UTF8 As Long = 65001

''' Number of elements in a byte array; 0 when the array has never been
''' dimensioned (UBound/LBound raise a run-time error in that case).
Private Function BytesLength(abBytes() As Byte) As Long
    BytesLength = 0
    ' An unallocated array makes the bounds lookup fail: jump straight to the
    ' exit so the default of 0 is what the caller receives.
    On Error GoTo Unallocated
    BytesLength = 1 + UBound(abBytes) - LBound(abBytes)
Unallocated:
End Function

''' Return VBA "Unicode" string from byte array encoded in UTF-8.
'''
''' abUtf8Array: byte array holding UTF-8 encoded text. Any lower bound is
'''              accepted; an uninitialized or empty array yields "".
''' Returns:     the decoded string, or "" if there is nothing to decode or
'''              the conversion fails.
Public Function Utf8BytesToString(abUtf8Array() As Byte) As String
    Dim nBytes As Long
    Dim nChars As Long
    Dim lFirst As Long
    Dim strOut As String

    Utf8BytesToString = ""
    ' Catch uninitialized input array
    nBytes = BytesLength(abUtf8Array)
    If nBytes <= 0 Then Exit Function

    ' Address the first element via LBound rather than a hard-coded 0 so that
    ' arrays that are not 0-based do not raise "Subscript out of range".
    lFirst = LBound(abUtf8Array)

    ' First call with a zero-length output buffer: get number of characters
    ' required for the output string
    nChars = MultiByteToWideChar(CP_UTF8, 0&, VarPtr(abUtf8Array(lFirst)), nBytes, 0&, 0&)
    ' 0 means the API call failed; there is nothing to convert
    If nChars <= 0 Then Exit Function

    ' Dimension output buffer to receive string
    strOut = String$(nChars, 0)
    ' Second call performs the conversion into the buffer
    nChars = MultiByteToWideChar(CP_UTF8, 0&, VarPtr(abUtf8Array(lFirst)), nBytes, StrPtr(strOut), nChars)
    ' Trim to the number of characters actually written
    Utf8BytesToString = Left$(strOut, nChars)
End Function

''' Download the plot page, decode the raw response bytes as UTF-8 and write
''' the text of the first "p.Blocksatz" element into cell A1 of the active sheet.
'''
''' Raises a run-time error with a descriptive message when the request does
''' not return HTTP 200 or when the expected element is missing from the page.
Public Sub test()

    Const PLOT_URL As String = "https://ssl.ofdb.de/plot/138627,271359,I-Am-Legend"
    Const PLOT_SELECTOR As String = "p.Blocksatz"

    Dim xhr As MSXML2.XMLHTTP60: Set xhr = New MSXML2.XMLHTTP60
    Dim html As MSHTML.HTMLDocument: Set html = New MSHTML.HTMLDocument
    Dim plotNode As Object

    With xhr
        .Open "GET", PLOT_URL, False
        .send
        ' Do not try to parse an error page
        If .Status <> 200 Then
            Err.Raise vbObjectError + 513, "test", _
                "Request for " & PLOT_URL & " failed: " & .Status & " " & .statusText
        End If
        ' responseBody holds the raw bytes; decode them explicitly as UTF-8
        html.body.innerHTML = Utf8BytesToString(.responseBody)
    End With

    ' querySelector returns Nothing when no element matches
    Set plotNode = html.querySelector(PLOT_SELECTOR)
    If plotNode Is Nothing Then
        Err.Raise vbObjectError + 514, "test", _
            "No element matching '" & PLOT_SELECTOR & "' was found in the response."
    End If

    ActiveSheet.Range("A1").Value = plotNode.innerText

End Sub

32-bit:

''' 32-bit variant of the MultiByteToWideChar declaration (kernel32).
''' Differs from the 64-bit declaration only in that it omits PtrSafe and
''' declares the two pointer arguments (lpMultiByteStr, lpWideCharStr) As Long.
''' Use it in place of the 64-bit declaration, not alongside it.
Private Declare Function MultiByteToWideChar Lib "kernel32" ( _
    ByVal CodePage As Long, _
    ByVal dwFlags As Long, _
    ByVal lpMultiByteStr As Long, _
    ByVal cchMultiByte As Long, _
    ByVal lpWideCharStr As Long, _
    ByVal cchWideChar As Long _
    ) As Long

Post a Comment for "Excel VBA Web Scraping Returning Wrong Text In MSXML2.XMLHTTP Method"