I am scraping images from Google. My script gets the image links, but they have this format:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQh6rPd9hx_fUGzorshx1fG5kzUM5FGCSYmm2YBuLU3uSFFI5BviIWd6hrHbw&s
The link opens in a browser and shows the image, but I can't download it with urllib.urlretrieve(imagenurl, imagen).
Does anyone know another way to download it? I am using Python 2.7.
import requests
from bs4 import BeautifulSoup
import urllib
def run():
    """Print the ``src`` URL of up to four images from a Google image search.

    Requests the Google image-search results page for the word ``'pez'``,
    parses the returned HTML with BeautifulSoup and prints the ``src``
    attribute of the ``<img>`` tags at positions 1 to 4 (the tag at
    position 0 is skipped). Nothing is downloaded or returned.
    """
    palabra = 'pez'
    response = requests.get('https://www.google.com/search?q={}&hl=es&sxsrf=ALeKk00KoMQKffGLNWV5UEKbuPwpySPuig:1596391733831&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiajd7Rjv3qAhXqTN8KHSINBkcQ_AUoAXoECBoQAw&biw=1262&bih=593'.format(palabra))
    soup = BeautifulSoup(response.content, 'html.parser')
    imagenes = soup.find_all('img')

    # Slicing instead of indexing 1..4 avoids an IndexError when the page
    # contains fewer than five <img> tags.
    for img in imagenes[1:5]:
        # .get() returns None instead of raising KeyError when the tag has
        # no src attribute.
        imagen_i = img.get('src')
        if not imagen_i:
            continue
        print(imagen_i)
        # To save the image, call urllib.urlretrieve(imagen_i, filename) with
        # a filename you choose yourself (e.g. '1.jpg'). The last path
        # segment of the URL ('images?q=tbn:...&s') contains characters such
        # as '?' and ':' that may not be allowed in filenames.


if __name__ == '__main__':
    run()
I have no problem downloading it with urlretrieve on Linux, but you should check print(imagen) to see which filenames you are using.
It gives names like
images?q=tbn:ANd9GcQh6rPd9hx_fUGzorshx1fG5kzUM5FGCSYmm2YBuLU3uSFFI5BviIWd6hrHbw&s
which contain characters that may not be allowed in filenames on your system, so the file may not be saved. You should either create the filenames yourself (e.g. 1.jpg, 2.jpg, ..., using "{}.jpg".format(i)) or remove the disallowed characters from the filename.
import requests
from bs4 import BeautifulSoup
import urllib
# Download up to four images from a Google image-search results page and
# save them as 1.jpg, 2.jpg, ... (relative to the current working directory).
# Written for Python 2.7: urllib.urlretrieve is the Python 2 API.
palabra = 'pez'
response = requests.get('https://www.google.com/search?q={}&hl=es&sxsrf=ALeKk00KoMQKffGLNWV5UEKbuPwpySPuig:1596391733831&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiajd7Rjv3qAhXqTN8KHSINBkcQ_AUoAXoECBoQAw&biw=1262&bih=593'.format(palabra))
soup = BeautifulSoup(response.content, 'html.parser')
all_images = soup.find_all('img')

# The <img> at index 0 is skipped; numbering of the saved files starts at 1.
for i, img in enumerate(all_images[1:5], 1):
    # .get() returns None instead of raising KeyError when the tag has no
    # src attribute; such a tag cannot be downloaded, so it is skipped.
    src = img.get('src')
    if not src:
        continue
    # A single concatenated argument keeps the output identical whether
    # print is the Python 2 statement or the print() function.
    print('src: ' + src)

    # The last path segment of the URL looks like 'images?q=tbn:...&s'. It is
    # printed only to show why it makes a poor filename: it contains
    # characters such as '?' and ':' that may not be allowed on some systems.
    url_tail = src.split('/')[-1]
    print('filename: ' + url_tail)

    # Alternative naming scheme (not used here): parse the URL with
    # urlparse.urlparse / urlparse.parse_qs and use the part of the 'q'
    # query parameter that follows 'tbn:' as the filename.

    # Use a simple, always-valid filename built from the loop counter.
    filename = '{}.jpg'.format(i)
    print('filename: ' + filename)

    urllib.urlretrieve(src, filename)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With