Summary: I want to iterate through a requests payload, so that I can change the log-in ID number for each scrape.
I'm using requests and Beautiful Soup to do a web scrape.
To log in to the page, I need to enter a unique ID number; I have a list of such numbers, called hit_list.
For any given ID number, this script works absolutely fine, but I want to automate it so that it runs through my entire hit_list.
In other words, I want num in payload_1 to change on each iteration. At present, num remains constant and the scrape just repeats for the length of hit_list (i.e. in this case the same scrape would run five times).
Please note, I'm very new to coding and this is my first project. I'm aware there are likely to be problems with it and am happy to receive constructive criticism.
Importing Libraries
import requests
import pymysql.cursors
from pymysql import connect, err, cursors
import sys
import time
import bs4
from datetime import datetime
import openpyxl
#Recording time @ Start
startTime = datetime.now()
print(datetime.now())
#use pymysql to create database- omitted here for parsimony
#This is a sample list, in reality the list will have 100,000 + numbers.
hit_list = [100100403,100100965,100101047,100100874,100100783]
"""
This is my code for importing the real list, included here in case the way the list is imported is relevant to the problem.
wb = openpyxl.load_workbook('/Users/Seansmac/Desktop/stage2_trial.xlsx')
sheet = wb.get_sheet_by_name('Sheet1')
type(wb)
#LOUIS: Only importing first twenty (for trial purposes)
for id in range(1, 20):
    hit_list.append(sheet.cell(row=id, column=1).value)
"""
def web_scrape():
#I'm only creating a function because I'm told it's good practice to put any 'bit' of logic into a function; I'm aware this probably looks amateurish.
#Open page
url = 'https://ndber.seai.ie/pass/ber/search.aspx'
with requests.session() as r:
r.headers.update({
'user-agent': 'For more information on this data collection please contact **************************************'
})
for num in hit_list:
#***LOCATION OF THE PROBLEM***
payload_1 = {
'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber':num,
'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search',
'__VIEWSTATE' :'/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFgxmD2QWAgIBD2QWAgIBD2QWAmYPZBYCZg9kFgQCAQ8WAh4JaW5uZXJodG1sZWQCAw9kFgICAg9kFgJmD2QWBAIBD2QWAgIDDw8WCB4EXyFTQgKAAh4MRGVmYXVsdFdpZHRoHB4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBgU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfBxxkZAIEDxQrAAJkEBYAFgAWABYCZg9kFgICAg9kFgJmDzwrABECARAWABYAFgAMFCsAAGQCBg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCDA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQYAQUzY3RsMDAkRGVmYXVsdENvbnRlbnQkQkVSU2VhcmNoJGdyaWRSYXRpbmdzJGdyaWR2aWV3D2dkrGhAYkdLuZZh8E98usAnWAaRMxurQ1Gquc+9krb7Boc=',
}
r.post(url, data=payload_1)
#click intermediate page
payload_2 = {
'__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
'__VIEWSTATE': "/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFg5mD2QWAgIBDxYCHwJoFgICAQ8PFgIfAmhkFgJmD2QWAmYPZBYEAgEPFgIeCWlubmVyaHRtbGVkAgMPZBYCAgIPZBYCZg9kFgQCAQ9kFgICAw8PFgoeBF8hU0ICgAIeDERlZmF1bHRXaWR0aBweBFRleHQFCTEwMDEwMDMxMh4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBwU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfCBxkZAICDw8WAh8CZ2QWAmYPZBYCZg9kFgICAw9kFgJmD2QWAmYPZBYCAgEPZBYCZg9kFgJmD2QWAgIBDxYCHwMFDlNlYXJjaCBSZXN1bHRzZAIEDxQrAAIPFgYfAmceElNlbGVjdGVkUm93SW5kZXhlczLNAQABAAAA/////wEAAAAAAAAABAEAAAB+U3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tTeXN0ZW0uSW50MzIsIG1zY29ybGliLCBWZXJzaW9uPTQuMC4wLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49Yjc3YTVjNTYxOTM0ZTA4OV1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24HAAAICAgJAgAAAAAAAAABAAAADwIAAAAAAAAACAseCmVkaXRfc3R5bGULKXNWMS5ORVQuV2ViQ29udHJvbHMuRWRpdFN0eWxlLCBWMS5ORVQuV2ViQ29udHJvbHMsIFZlcnNpb249MS40LjAuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj01YmYzNDU3ZDMwODk1MjEzAmQQFgAWABYAFgJmD2QWAgICD2QWAmYPPCsAEQMADxYEHgtfIURhdGFCb3VuZGceC18hSXRlbUNvdW50AgFkARAWABYAFgAMFCsAABYCZg9kFgICAQ9kFgpmD2QWAgIBDw8WBB4PQ29tbWFuZEFyZ3VtZW50BQkxMDAxMDAzMTIfBgUJMTAwMTAwMzEyZGQCAQ9kFgJmDw8WAh8GBQNCRVJkZAICD2QWAmYPDxYCHwYFCzEwMDE1MTAwMDkwZGQCAw9kFgJmDw8WAh8GBQowNy0wMS0yMDA5ZGQCBA9kFgJmDw8WAh8GBSQzMCBNQVJJTkUgVklFVw1BVEhMT05FDUNPLiBXRVNUTUVBVEhkZAIGDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIIDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIKDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIMDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZBgBBTNjdGwwMCREZWZhdWx0Q29udGVudCRCRVJTZWFyY2gkZ3JpZFJhdGluZ3MkZ3JpZHZpZXcPPCsADAEIAgFkjLH/5QxuANxuCh3kAmhUU/4/OZj+wy8nJDYIFx4Lowo=",
'__VIEWSTATEGENERATOR':"1F9CCB97",
'__EVENTVALIDATION': "/wEdAAbaTEcivWuxiWecwu4mVYO9eUnQmzIzqu4hlt+kSDcrOBWCa0ezllZh+jGXjO1EB1dmMORt6G1O0Qbn0WLg3p+rPmLeN6mjN7eq7JtUZMjpL2DXqeB/GqPe7AFtNDKiJkEPdN6Y/vq7o/49hX+o366Ioav3zEBl37yPlq3sYQBXpQ==",
}
s=r.post(url, data=payload_2)
#scrape the page
soup = bs4.BeautifulSoup(s.content, 'html.parser')
"""
FOR THE PURPOSES OF MY ISSUE EVERYTHING BELOW WORKS FINE & CAN BE SKIPPED
"""
print('\nBEGINNING SCRAPE....')
# First Section
ber_dec = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsBER'})
#Address- clean scrape
address = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress'})
address = (address.get_text(',').strip())
print('address:', address)
#Date of Issue- clean scrape
date_issue1 = ber_dec.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfBER_container_DateOfIssue'})
date_issue = date_issue1.find('div', {'class':'formControlReadonly'})
date_issue = (date_issue.get_text().strip())
print('date_of_issue:',date_issue)
#MPRN -Clean scrape
MPRN1 = ber_dec.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfBER_container_MPRN'})
MPRN = MPRN1.find('div',{'class':'formControlReadonly'})
MPRN = MPRN.get_text().strip()
print('MPRN:', MPRN)
#Emissions Indicator- clean scrape
emissions_indicator1 = ber_dec.find('div',{'id':'ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue'})
emissions_indicator_bunched = emissions_indicator1.get_text().strip()
print('\n\nem_bunched:',emissions_indicator_bunched)
emissions_indicator, emissions_indicator_unit = emissions_indicator_bunched.split()
print('emissions_indicator:',emissions_indicator)
emissions_indicator_unit= emissions_indicator_unit.replace("(","")
emissions_indicator_unit=emissions_indicator_unit.replace(")","")
print('emissions_indicator_unit:',emissions_indicator_unit)
#BER Score- clean scrape
BER_bunched = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating'})
BER_bunched =(BER_bunched.get_text().strip())
print ('\n \nBER_bunched:', BER_bunched)
BER_score, BER_actual_rating, BER_unit = BER_bunched.split()
print('\nBER_score:',BER_score)
print('\nBER_actual_rating:',BER_actual_rating)
BER_unit = BER_unit.replace("(", " ")
BER_unit = BER_unit.replace(")","")
print('\nClean_BER_unit:',BER_unit )
#Type of Rating- clean scrape
type_of_rating1= ber_dec.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfBER_container_TypeOfRating'})
type_of_rating= type_of_rating1.find('div',{'class':'formControlReadonly'})
type_of_rating = type_of_rating.get_text().strip()
print('type_of_rating:',type_of_rating )
# Second Section
dwelling_details = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsStructure'})
#Dwelling Type- clean scrape
dwelling_type1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DwellingType'})
dwelling_type = dwelling_type1.find('div',{'class':'formControlReadonly'})
dwelling_type = dwelling_type.get_text().strip()
print ('Dwelling Type:', dwelling_type)
#Number of Stories- clean scrape
num_stories1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_NoStoresy'})
num_stories = num_stories1.find('div',{'class':'formControlReadonly'})
num_stories = num_stories.get_text().strip()
print('Number of Stories:', num_stories)
#Year of Construction- clean scrape
yr_construction1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DateOfConstruction'})
yr_construction = yr_construction1.find('div',{'class':'formControlReadonly'})
yr_construction = yr_construction.get_text().strip()
print('Year of Construction:', yr_construction)
#Floor Area- clean scrape
floor_area= dwelling_details.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_div_FloorArea'})
floor_area = floor_area.get_text().strip()
floor_area, floor_area_unit =floor_area.split()
floor_area_unit = floor_area_unit.replace("(","")
floor_area_unit=floor_area_unit.replace(")","")
print('\nFloor Area:', floor_area)
print('floor_area_unit:', floor_area_unit)
#Wall Type- clean scrape
wall_type1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_WallType'})
wall_type = wall_type1.find('div',{'class':'formControlReadonly'})
wall_type= wall_type.get_text().strip()
print('Wall Type:', wall_type)
#Glazing Type- clean scrape
glazing_type1 =dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_GlazingType'})
glazing_type =glazing_type1.find('div',{'class':'formControlReadonly'})
glazing_type = glazing_type.get_text().strip()
print('Glazing Type:', glazing_type)
#Percent Low Energy Lighting- clean scrape
percent_low_energy_lighting1= dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_PercentLowEnergyLight'})
percent_low_energy_lighting = percent_low_energy_lighting1.find('div',{'class':'formControlReadonly'})
percent_low_energy_lighting = percent_low_energy_lighting.get_text().strip()
print('% Low Energy Lighting:', percent_low_energy_lighting)
#Space Heating Fuel- clean scrape
space_heating_fuel1 =dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingFuel'})
space_heating_fuel =space_heating_fuel1.find('div',{'class':'formControlReadonly'})
space_heating_fuel = space_heating_fuel.get_text().strip()
print('Space Heating Fuel:',space_heating_fuel)
#Space Heating Efficiency- clean scrape
space_heating_efficiency1= dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingEfficiency'})
space_heating_efficiency = space_heating_efficiency1.find('div',{'class':'formControlReadonly'})
space_heating_efficiency= space_heating_efficiency.get_text().strip()
print('Space Heating Efficiency:', space_heating_efficiency)
#Water Heating Fuel- clean scrape
water_heating_fuel1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingFuel'})
water_heating_fuel =water_heating_fuel1.find('div',{'class':'formControlReadonly'})
water_heating_fuel = water_heating_fuel.get_text().strip()
print('Water Heating Fuel:', water_heating_fuel)
#Water Heating Efficiency- clean scrape
water_heating_efficiency1 =dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingEfficiency'})
water_heating_efficiency =water_heating_efficiency1.find('div',{'class':'formControlReadonly'})
water_heating_efficiency= water_heating_efficiency.get_text().strip()
print('Water Heating Efficiency:', water_heating_efficiency)
# Third Section
assessor_details = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsAssessor'})
#Assessor Number- clean scrape
assessor_num1 = assessor_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfAssessor_container_AssessorNumber'})
assessor_num = assessor_num1.find('div',{'class':'formControlReadonly'})
assessor_num= assessor_num.get_text().strip()
print('Assessor Number:', assessor_num)
print('BER:', num)
print('\n***************SCRAPE FINISHED***************\n')
#Populate datebase
print('\nRECONNECTING WITH DATABASE')
with connection.cursor() as cursor:
print('SUCCESSFUL CONNECTION')
sql =("INSERT INTO table1(BER_number, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating)VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
cursor.execute(sql, (num, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating))
print('ROW POPULATED')
#Calling the function
web_scrape()
#Metadata
print('Gathering Details...')
Run_time = datetime.now() - startTime
print('Run Time:', Run_time)
#Loop Finished
print('\n***************PROGRAMME FINISHED***************')
You need to get new __EVENTVALIDATION tokens etc. for each post; you cannot just copy values from your browser and hard-code them into your post data:
import requests
from bs4 import BeautifulSoup

url = 'https://ndber.seai.ie/pass/ber/search.aspx'
hit_list = [100100403, 100100965, 100101047, 100100874, 100100783]

def renew(s):
    # fetch the search page and pull out the current ASP.NET state tokens
    soup = BeautifulSoup(s.get(url).content, "html.parser")
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}

with requests.session() as s:
    for num in hit_list:
        payload_1 = {
            'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': num,
            'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
        # update the post data with new token values
        payload_1.update(renew(s))
        r = s.post(url, data=payload_1)
        # scrape the page
        soup = BeautifulSoup(r.content, 'html.parser')
If we run the code and parse a bit of what is returned, you can see we get each page correctly:
In [8]: with requests.session() as s:
   ...:     for num in hit_list:
   ...:         payload_1 = {
   ...:             'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': str(num),
   ...:             'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
   ...:         payload_1.update(renew(s))
   ...:         r = s.post(url, data=payload_1)
   ...:         soup = BeautifulSoup(r.content, 'html.parser')
   ...:         spans = soup.select("#ctl00_DefaultContent_BERSearch_gridRatings_gridview tr.GridRowStyle td span")
   ...:         print(spans)
   ...:
[<span>BER</span>, <span>10003467711</span>, <span>07-01-2009</span>, <span>24 CLONEE COURT\rMAIN STREET\rCLONEE\rCO. MEATH</span>]
[<span>BER</span>, <span>10301654014</span>, <span>26-11-2014</span>, <span>19 GORTANORA\rDINGLE\rCO. KERRY</span>]
[<span>BER</span>, <span>10002082335</span>, <span>08-01-2009</span>, <span>8 CANNON PLACE\r1 HERBERT ROAD\rDUBLIN 4</span>]
[<span>BER</span>, <span>10301653940</span>, <span>18-01-2015</span>, <span>12 GORTANORA\rDINGLE\rCO. KERRY</span>]
[<span>BER</span>, <span>10010500405</span>, <span>07-01-2009</span>, <span>13 RENMORE ROAD\rGALWAY CITY</span>]
That gives you all the info from the table bar the BER cert number; you already have that, so you don't need to worry about it.
As you figured out, you just need to pass the data returned from the first post into your second payload. If you encapsulate the logic in functions, it will also make your code a bit easier to manage:
def renew(soup):
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}

def parse_data(soup):
    address = soup.select_one("#ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress").text.strip()
    MPRN = soup.select_one("#ctl00_DefaultContent_BERSearch_dfBER_container_MPRN div.formControlReadonly").text.strip()
    emissions_indicator, emissions_indicator_unit = soup.select_one(
        "#ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue").text.split()
    emissions_indicator_unit = emissions_indicator_unit.strip("()")
    BER_score, BER_actual_rating, BER_unit = soup.select_one(
        "#ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating").text.split()
    BER_unit = BER_unit.strip("()")
    return {"MPRN": MPRN, "emissions_indicator": emissions_indicator,
            "emissions_indicator_unit": emissions_indicator_unit,
            "BER_score": BER_score, "BER_actual_rating": BER_actual_rating,
            "BER_unit": BER_unit, "address": address}

def submit_to_db(dct):
    # connection comes from the pymysql setup omitted in the question
    with connection.cursor() as cursor:
        print('SUCCESSFUL CONNECTION')
        # build the column list and the placeholders from the dict keys
        sql = "INSERT INTO table1 ( %s ) VALUES ( %s )" % (",".join(dct), ', '.join(['%s'] * len(dct)))
        cursor.execute(sql, list(dct.values()))

payload_1 = {
    'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}

payload_2 = {
    '__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
}

with requests.session() as s:
    tokens = renew(BeautifulSoup(s.get(url).content, "html.parser"))
    for num in hit_list:
        # update the post data with new token values
        payload_1['ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber'] = num
        payload_1.update(tokens)
        r = s.post(url, data=payload_1)
        tokens2 = renew(BeautifulSoup(r.content, 'html.parser'))
        payload_2.update(tokens2)
        soup = BeautifulSoup(s.post(url, data=payload_2).content, "html.parser")
        submit_to_db(parse_data(soup))
I have not parsed all the data, but the logic is the same for the rest; a sketch of one extra field follows the sample output below. Printing the dicts returned for what is parsed will give you:
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '57.83', 'address': '24 CLONEE COURTMAIN STREETCLONEECO. MEATH', 'BER_score': 'D1', 'BER_actual_rating': '235.54', 'MPRN': '10003467711'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '42.4', 'address': '19 GORTANORADINGLECO. KERRY', 'BER_score': 'C1', 'BER_actual_rating': '165.79', 'MPRN': '10301654014'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '34.03', 'address': '8 CANNON PLACE1 HERBERT ROADDUBLIN 4', 'BER_score': 'C2', 'BER_actual_rating': '175.32', 'MPRN': '10002082335'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '53.51', 'address': '12 GORTANORADINGLECO. KERRY', 'BER_score': 'C3', 'BER_actual_rating': '208.45', 'MPRN': '10301653940'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '121.54', 'address': '13 RENMORE ROADGALWAY CITY', 'BER_score': 'G', 'BER_actual_rating': '472.19', 'MPRN': '10010500405'}
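Extending parse_data to the remaining fields follows the same select_one pattern. As a sketch (the selector is copied from the date-of-issue lookup in the question and assumed unchanged):
def parse_date_issue(soup):
    # Sketch only: same pattern as parse_data, selector taken from the question's code.
    return soup.select_one(
        "#ctl00_DefaultContent_BERSearch_dfBER_container_DateOfIssue div.formControlReadonly"
    ).text.strip()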
@PadraicCunningham provided most of the logic for this answer, but as my comment below his answer describes, his solution only got me halfway.
I have been able to build on his work to solve the problem.
There was just one more step to complete: 'clicking through' an intermediate page, which leads to where the data I wanted to scrape lies.
Apologies in advance for my non-standard labelling and formatting. I'm a beginner.
import requests
import pymysql.cursors
from pymysql import connect, err, cursors
import sys
import time
import bs4
from datetime import datetime
import openpyxl
hit_list = [100100403,100100965,100101047,100100874,100100783] #this is a sample list
#Open page
url = 'https://ndber.seai.ie/pass/ber/search.aspx'
def field_update(s):
    soup = bs4.BeautifulSoup(s.get(url).content, "html.parser")
    print('field updated')
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}
with requests.session() as s:
    for ber in hit_list:
        payload_1 = {
            'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': ber,
            'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
        # update the post data with new token values
        payload_1.update(field_update(s))
        r = s.post(url, data=payload_1)
        #'click through' intermediate page
        #THIS IS THE ADDITIONAL CODE THAT BUILDS ON PADRAIC'S ANSWER
        soup = bs4.BeautifulSoup(r.content, "html.parser")
        stage_two = {
            "__EVENTTARGET": 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
            "__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}
        q = s.post(url, data=stage_two)
        print('payload_2 posted')
        soup = bs4.BeautifulSoup(q.content, 'html.parser')
        print('\nBEGINNING SCRAPE....')
        #FOR DATA TO BE SCRAPED, SEE ORIGINAL QUESTION
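        # A hedged sketch of the first couple of fields, reusing the selectors from the
        # original question (address and MPRN); assumes the detail page layout is unchanged.
        ber_dec = soup.find('fieldset', {'id': 'ctl00_DefaultContent_BERSearch_fsBER'})
        address = ber_dec.find('div', {'id': 'ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress'})
        address = address.get_text(',').strip()
        print('address:', address)
        MPRN = ber_dec.find('span', {'id': 'ctl00_DefaultContent_BERSearch_dfBER_container_MPRN'})
        MPRN = MPRN.find('div', {'class': 'formControlReadonly'}).get_text().strip()
        print('MPRN:', MPRN)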