I've written some code to parse the name, address and phone number of different shops from yell.com. Given any search link, my crawler parses the whole result set, regardless of how many pages it is spread across. However, the one problem I've found is that it always skips the content of the first page: if there are 10 pages, my crawler scrapes only the last 9. A small tweak would probably fix it, but I can't find it. Here is the complete code. Thanks in advance.
' Scrapes business names, address parts and phone numbers from a yell.com
' search and writes them to worksheet cells, one row per listing.
' NOTE: as written, the results on the initially requested page are never
' parsed; only the pages reached through the pagination anchors are.
Sub YellUK()
' Site root, prepended to the hrefs taken from the pagination bar.
Const mlink = "https://www.yell.com"
' html holds the initial results page; htm is reused for every followed page.
Dim http As New MSXML2.XMLHTTP60, html As New HTMLDocument, htm As New HTMLDocument
Dim post As HTMLHtmlElement, page As Object, newlink As String
' i and x are not declared in this snippet, so they are implicit Variants
' (assumes no Option Explicit is in effect for the module - TODO confirm).
With http
' Third argument False: the request is synchronous, .send blocks until done.
.Open "GET", "https://www.yell.com/ucs/UcsSearchAction.do?keywords=pizza&location=United+Kingdom&scrambleSeed=1426936001", False
.send
html.body.innerHTML = .responseText
End With
' All <a> elements inside the first "row pagination" container.
' In the pagination markup the selected page is a <span>, not an <a>, so the
' page that was just downloaded has no entry in this collection.
Set page = html.getElementsByClassName("row pagination")(0).getElementsByTagName("a")
' Length - 2 leaves out the last anchor (the "Next" link in the pagination markup).
For i = 0 To page.Length - 2
' The href is stripped of an "about:" prefix before the site root is prepended
' (presumably MSHTML adds it to relative links in a document built from
' innerHTML - TODO confirm).
newlink = mlink & Replace(page(i).href, "about:", "")
With http
.Open "GET", newlink, False
.send
htm.body.innerHTML = .responseText
End With
' One worksheet row per "js-LocalBusiness" element on the followed page.
For Each post In htm.getElementsByClassName("js-LocalBusiness")
' x is the running listing counter; values are written to row x + 1.
x = x + 1
' Column 1: text of the first link in the title row.
With post.getElementsByClassName("row businessCapsule--title")(0).getElementsByTagName("a")
If .Length Then Cells(x + 1, 1) = .Item(0).innerText
End With
' Columns 2-4: the second, third and fourth <span> of the address container.
' The same span collection is looked up three times, once per column.
With post.getElementsByClassName("col-sm-10 col-md-11 col-lg-12 businessCapsule--address")(0).getElementsByTagName("span")
If .Length > 1 Then Cells(x + 1, 2) = .Item(1).innerText
End With
With post.getElementsByClassName("col-sm-10 col-md-11 col-lg-12 businessCapsule--address")(0).getElementsByTagName("span")
If .Length > 2 Then Cells(x + 1, 3) = .Item(2).innerText
End With
With post.getElementsByClassName("col-sm-10 col-md-11 col-lg-12 businessCapsule--address")(0).getElementsByTagName("span")
If .Length > 3 Then Cells(x + 1, 4) = .Item(3).innerText
End With
' Column 5: text of the second "businessCapsule--tel" element, when there is one.
With post.getElementsByClassName("businessCapsule--tel")
If .Length > 1 Then Cells(x + 1, 5) = .Item(1).innerText
End With
Next post
Next i
End Sub
Here are the elements in which the page numbers for the following pages are stored:
<div class="row pagination">
<div class="col-sm-24">
<span class="pagination--page is-selected">1</span>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=2" data-tracking="DISPLAY:PAGINATION:NUMBER">2</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=3" data-tracking="DISPLAY:PAGINATION:NUMBER">3</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=4" data-tracking="DISPLAY:PAGINATION:NUMBER">4</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=5" data-tracking="DISPLAY:PAGINATION:NUMBER">5</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=6" data-tracking="DISPLAY:PAGINATION:NUMBER">6</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=7" data-tracking="DISPLAY:PAGINATION:NUMBER">7</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=8" data-tracking="DISPLAY:PAGINATION:NUMBER">8</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=9" data-tracking="DISPLAY:PAGINATION:NUMBER">9</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=10" data-tracking="DISPLAY:PAGINATION:NUMBER">10</a>
<a rel="nofollow" class="pagination--next" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=2" data-tracking="DISPLAY:PAGINATION:NEXT">Next</a>
</div>
</div>
The problem is that the very first page is already selected, so it has no anchor in the pagination (it is rendered as a <span> instead of an <a>). The solution is to process the first page first, and then process the remaining pages using the pagination links. HTH
Option Explicit
' Scrapes every results page of a yell.com search and writes one worksheet
' row per business listing (name, address parts, phone number).
'
' The initially requested page is parsed directly, because the pagination bar
' renders the selected page as a <span> and therefore offers no link to it.
' The remaining pages are then reached through the pagination anchors.
Sub YellUK()
    Const mlink = "https://www.yell.com"
    Const startUrl = "https://www.yell.com/ucs/UcsSearchAction.do?keywords=pizza&location=United+Kingdom&scrambleSeed=1426936001"

    Dim http As New MSXML2.XMLHTTP60
    Dim html As New HTMLDocument      ' initial results page
    Dim html2 As New HTMLDocument     ' reused for each followed page
    Dim pagination As Object, page As Object
    Dim newlink As String
    Dim i As Long
    Dim x As Long                     ' running listing counter shared with GetPageData

    ' Synchronous request (third argument False): .send blocks until completion.
    With http
        .Open "GET", startUrl, False
        .send
        html.body.innerHTML = .responseText
    End With

    ' First page first: it is selected already, so the pagination bar has no <a> for it.
    GetPageData x, html

    ' A result set that fits on one page has no pagination bar at all; indexing
    ' element (0) of the empty collection would raise run-time error 91.
    Set pagination = html.getElementsByClassName("row pagination")
    If pagination.Length = 0 Then Exit Sub
    Set page = pagination.Item(0).getElementsByTagName("a")

    ' Remaining pages. Length - 2 leaves out the last anchor, the "Next" link.
    For i = 0 To page.Length - 2
        ' Strip the "about:" prefix from the href before prepending the site root.
        newlink = mlink & Replace(page(i).href, "about:", "")
        With http
            .Open "GET", newlink, False
            .send
            html2.body.innerHTML = .responseText
        End With
        GetPageData x, html2
    Next i
End Sub
' Writes one worksheet row for every "js-LocalBusiness" element in a results page.
'
' x    - running listing counter owned by the caller (passed ByRef). It is
'        incremented once per listing and the listing is written to row x + 1.
' html - document holding the downloaded results page.
'
' Columns: 1 = title link text, 2-4 = second to fourth <span> of the address
' container, 5 = second "businessCapsule--tel" element. A listing that lacks
' one of these elements keeps the corresponding cell(s) untouched.
Private Sub GetPageData(ByRef x, ByRef html As HTMLDocument)
    Dim post As HTMLHtmlElement
    Dim holder As Object, nodes As Object
    Dim idx As Long

    For Each post In html.getElementsByClassName("js-LocalBusiness")
        x = x + 1

        ' Name: guard the container lookup, since indexing (0) on an empty
        ' collection yields Nothing and the chained call would raise error 91.
        Set holder = post.getElementsByClassName("row businessCapsule--title")
        If holder.Length > 0 Then
            Set nodes = holder.Item(0).getElementsByTagName("a")
            If nodes.Length > 0 Then Cells(x + 1, 1) = nodes.Item(0).innerText
        End If

        ' Address: fetch the span collection once, then copy spans 1..3 to
        ' columns 2..4. Span 0 is skipped, as in the original extraction.
        Set holder = post.getElementsByClassName("col-sm-10 col-md-11 col-lg-12 businessCapsule--address")
        If holder.Length > 0 Then
            Set nodes = holder.Item(0).getElementsByTagName("span")
            For idx = 1 To 3
                If nodes.Length > idx Then Cells(x + 1, idx + 1) = nodes.Item(idx).innerText
            Next idx
        End If

        ' Phone: the second matching element is the one used
        ' (presumably the first holds a label or icon - TODO confirm against the markup).
        Set nodes = post.getElementsByClassName("businessCapsule--tel")
        If nodes.Length > 1 Then Cells(x + 1, 5) = nodes.Item(1).innerText
    Next post
End Sub
EDIT:
It could be something like this: the first-page link is created for i = -1, and then the following pages are handled as usual.
For i = -1 To page.Length - 2
    If i >= 0 Then
        ' Regular case: follow the i-th pagination anchor.
        newlink = mlink & Replace(page(i).href, "about:", "")
    Else
        ' i = -1 stands for the first page, which has no anchor of its own:
        ' take the first pagination link and swap its final character for "1".
        newlink = mlink & Replace(page(0).href, "about:", "")
        newlink = Left(newlink, Len(newlink) - 1) & "1"
    End If

    Debug.Print i & ", " & newlink ' Prints the links for all the pages

    ' Synchronous download of the page into the reusable document.
    http.Open "GET", newlink, False
    http.send
    htm.body.innerHTML = http.responseText

    ' Get page data here ...
Next i
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With