Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Scraping URLs from web pages using Selenium Python (NSFW)

I'm learning Python by trying to write a script to scrape xHamster. If anyone's familiar with the site, I'm trying to specifically write all URLs of a given user's videos to a .txt file.

Currently, I've managed to scrape the URLs off a specific page, however there are multiple pages and I'm struggling to loop through the number of pages.

In my attempt below I've commented where I'm trying to read the URL of the next page; however, it currently prints None. Any ideas why and how to resolve this?

Current script:

#!/usr/bin/env python

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Scrape the video links from the first listing page of one user and write
# them, one per line, to "x--<username>.txt".

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")

driver = webdriver.Chrome(chrome_options=chrome_options)

# Placeholder: replace with the name of the user whose videos should be listed.
username = "ANY_USERNAME"
url = "https://xhams***.com/user/video/" + username + "/new-1.html"

try:
    # Element lookups keep retrying for up to 10 seconds before giving up.
    driver.implicitly_wait(10)
    driver.get(url)

    # The href of every element with class "hRotator" is what gets saved.
    links = driver.find_elements_by_class_name('hRotator')

    # The context manager closes the file even if a write fails.
    with open('x--' + username + '.txt', 'w') as output:
        for link in links:
            href = link.get_attribute('href')
            # get_attribute() yields None when the attribute is missing;
            # skip those instead of failing on None + '\n'.
            if href:
                output.write(href + '\n')
finally:
    # quit() ends the whole browser session (and the driver process), and
    # runs even when scraping raised.
    driver.quit()

My attempt at looping through the pages:

#!/usr/bin/env python

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Scrape the video links from the first listing page of one user, print the
# candidate "next page" link, and write the video links, one per line, to
# "x--<username>.txt".

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")

driver = webdriver.Chrome(chrome_options=chrome_options)

# Placeholder: replace with the name of the user whose videos should be listed.
username = "ANY_USERNAME"
url = "https://xhams***.com/user/video/" + username + "/new-1.html"

try:
    # Element lookups keep retrying for up to 10 seconds before giving up.
    driver.implicitly_wait(10)
    driver.get(url)

    # The href of every element with class "hRotator" is what gets saved.
    links = driver.find_elements_by_class_name('hRotator')

    ## TRYING TO READ THE NEXT PAGE HERE
    # NOTE(review): get_attribute('href') yields None when the matched element
    # itself has no href attribute. Presumably class "last" sits on a wrapper
    # element rather than on the <a> -- confirm against the page markup and, if
    # so, look up the <a> inside it instead.
    # print(...) with a single argument works on both Python 2 and Python 3.
    print(driver.find_element_by_class_name('last').get_attribute('href'))

    # The context manager closes the file even if a write fails.
    with open('x--' + username + '.txt', 'w') as output:
        for link in links:
            href = link.get_attribute('href')
            # Skip elements without an href instead of failing on None + '\n'.
            if href:
                output.write(href + '\n')
finally:
    # quit() ends the whole browser session (and the driver process), and
    # runs even when scraping raised.
    driver.quit()

UPDATE:

I've used Philippe Oger's answer below but modified the two methods below to work for single page results:

def find_max_pagination(self):
    """Work out how many listing pages the user has and store it in ``self.max_page``.

    Fetches the user's first listing page and reads the labels of the links
    inside the ``div.pager`` element; the largest number among them is taken
    as the page count. When there are no pager links, or none of them carries
    a usable label, the count falls back to 1.

    Returns:
        The value stored in ``self.max_page``.

    Raises:
        ValueError: if a pager label other than ``None`` or ``'...'`` is not
            an integer.
    """
    start_url = 'https://www.xhamster.com/user/video/{}/new-1.html'.format(self.user)
    r = requests.get(start_url)
    tree = html.fromstring(r.content)

    # Query the pager once and reuse the result.
    pager_links = tree.xpath('//div[@class="pager"]/table/tr/td/div/a')
    page_numbers = [int(a.text) for a in pager_links if a.text not in [None, '...']]

    # max() raises ValueError on an empty sequence, which also happens when
    # links exist but every label was filtered out above.
    self.max_page = max(page_numbers) if page_numbers else 1

    return self.max_page

def generate_listing_urls(self):
    """Return the listing-page URLs for pages 1 through ``self.max_page``.

    Page numbers start at 1 (the first listing page is ``new-1.html``), so the
    range runs from 1 to ``self.max_page`` inclusive. A single-page user needs
    no special case: the range is then just ``[1]``.

    Returns:
        A list with one URL per listing page, in ascending page order.
    """
    return [
        self.paginated_listing_page(str(page))
        for page in range(1, self.max_page + 1)
    ]
like image 643
kong88 Avatar asked May 25 '26 05:05

kong88


1 Answers

On a user page we can actually find out how far the pagination goes, so instead of looping through the pagination, we can generate each listing URL for the user with a list comprehension, and then scrape those one by one.

Here are my two cents using LXML. If you simply copy/paste this code, it will write every video URL to a TXT file. You only need to change the user name.

from lxml import html
import requests


class XXXVideosScraper(object):
    """Collect the video URLs listed on one user's paginated video pages.

    Typical use is ``XXXVideosScraper(user).run()``, which discovers the number
    of listing pages, scrapes each of them and writes the collected URLs to a
    text file, one per line.
    """

    # Listing-page URL; the placeholders are the user name and the page number.
    LISTING_URL = 'https://www.xhamster.com/user/video/{}/new-{}.html'
    # XPath of the numbered links inside the pager.
    PAGER_LINKS_XPATH = '//div[@class="pager"]/table/tr/td/div/a'
    # Seconds to wait on the server; without a timeout a stalled connection
    # would block the scraper forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, user):
        """Remember *user*; nothing is fetched until a scraping method is called."""
        self.user = user
        self.max_page = None  # set by find_max_pagination()
        self.video_urls = []  # filled by capture_video_urls()

    def run(self, output_path='results.txt'):
        """Scrape every listing page and write the video URLs to *output_path*.

        Args:
            output_path: file to (over)write, one URL per line. Defaults to
                ``results.txt`` in the current directory.
        """
        self.find_max_pagination()
        for page_url in self.generate_listing_urls():
            self.capture_video_urls(page_url)
        with open(output_path, 'w') as f:
            f.writelines(video + '\n' for video in self.video_urls)

    def find_max_pagination(self):
        """Store and return the number of listing pages the user has.

        The count is the largest numeric label among the pager links of the
        first listing page. Labels that are not numbers (missing text, ``...``
        and the like) are ignored individually, so one odd label no longer
        discards the whole pagination. With no numeric label at all the count
        is 1.
        """
        tree = self._fetch_tree(self.paginated_listing_page(1))
        candidates = (
            self._page_number(link.text)
            for link in tree.xpath(self.PAGER_LINKS_XPATH)
        )
        page_numbers = [number for number in candidates if number is not None]
        self.max_page = max(page_numbers) if page_numbers else 1
        return self.max_page

    def generate_listing_urls(self):
        """Return the listing URLs for pages 1 through ``self.max_page``."""
        return [
            self.paginated_listing_page(page)
            for page in range(1, self.max_page + 1)
        ]

    def paginated_listing_page(self, pagination):
        """Return the listing URL of page number *pagination* for this user."""
        return self.LISTING_URL.format(self.user, str(pagination))

    def capture_video_urls(self, url):
        """Append the href of every ``a.hRotator`` link found at *url* to ``self.video_urls``."""
        tree = self._fetch_tree(url)
        self.video_urls.extend(tree.xpath('//a[@class="hRotator"]/@href'))

    def _fetch_tree(self, url):
        """Download *url* and return its parsed HTML tree."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return html.fromstring(response.content)

    @staticmethod
    def _page_number(label):
        """Return *label* as an int, or None when it is not a page number."""
        try:
            return int(label)
        except (TypeError, ValueError):
            # TypeError: label is None; ValueError: text such as '...'.
            return None


if __name__ == '__main__':
    # Script entry point: scrape the sample account and write its video URLs.
    XXXVideosScraper('wearehairy').run()

I have not checked the case where there is only 1 page in total for a user. Let me know if this works fine.

like image 113
Philippe Oger Avatar answered May 27 '26 19:05

Philippe Oger



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!