I am trying to scrape multiple URLs one by one, then repeat the scrape after one minute.
But I keep getting two errors and was hoping for some help.
I get an error saying:
functions declared within loops referencing an outer scoped variable may lead to confusing semantics
And I get this error when I run the code:
TimeoutError: Navigation timeout of 30000 ms exceeded.
My code:
const puppeteer = require("puppeteer");
const urls = [
  'https://www.youtube.com/watch?v=cw9FIeHbdB8',
  'https://www.youtube.com/watch?v=imy1px59abE',
  'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
];
const scrape = async () => {
  let browser, page;
  try {
    browser = await puppeteer.launch({ headless: true });
    page = await browser.newPage();
    for (let i = 0; i < urls.length; i++) {
      const url = urls[i];
      await page.goto(`${url}`);
      await page.waitForNavigation({ waitUntil: 'networkidle2' });
      await page.waitForSelector('.view-count', { visible: true, timeout: 60000 });
      const data = await page.evaluate(() => { // the "functions declared within loops referencing an outer scoped variable" warning is reported on this line.
        return [
          JSON.stringify(document.querySelector('#text > a').innerText),
          JSON.stringify(document.querySelector('#container > h1').innerText),
          JSON.stringify(document.querySelector('.view-count').innerText),
          JSON.stringify(document.querySelector('#owner-sub-count').innerText)
        ];
      });
      const [channel, title, views, subs] = [JSON.parse(data[0]), JSON.parse(data[1]), JSON.parse(data[2]), JSON.parse(data[3])];
      console.log({ channel, title, views, subs });
    }
  } catch (err) {
    console.log(err);
  } finally {
    if (browser) {
      await browser.close();
    }
    await setTimeout(scrape, 60000); // repeat one minute after all urls have been scraped.
  }
};
scrape();
I would really appreciate any help I could get.
This works. Putting the for loop inside a Promise and passing waitUntil: "networkidle2" as an option to page.goto() resolves your problem. You don't need to launch a new browser each time, so it should be declared outside of the for loop.
const puppeteer = require("puppeteer");
const urls = [
  "https://www.youtube.com/watch?v=cw9FIeHbdB8",
  "https://www.youtube.com/watch?v=imy1px59abE",
  "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
];
const scrape = async () => {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  new Promise(async (resolve, reject) => {
    for (const url of urls) {
      // your one-minute timeout
      await page.waitForTimeout(60 * 1000);
      await page.goto(`${url}`, {
        waitUntil: "networkidle2",
        timeout: 60 * 1000,
      });
      await page.waitForSelector(".view-count", {
        timeout: 60 * 1000,
      });
      const data = await page.evaluate(() => {
        return [
          JSON.stringify(document.querySelector("#text > a").innerText),
          JSON.stringify(document.querySelector("#container > h1").innerText),
          JSON.stringify(document.querySelector(".view-count").innerText),
          JSON.stringify(document.querySelector("#owner-sub-count").innerText),
        ];
      });
      const [channel, title, views, subs] = [
        JSON.parse(data[0]),
        JSON.parse(data[1]),
        JSON.parse(data[2]),
        JSON.parse(data[3]),
      ];
      console.log({ channel, title, views, subs });
    }
    resolve(true);
  })
    .then(async () => {
      await browser.close();
    })
    .catch((reason) => {
      console.log(reason);
    });
};
scrape();
Update: As per ggorlen's suggestion, the refactored code below should solve your problem. The comments in the code indicate the purpose of each line.
const puppeteer = require("puppeteer");
const urls = [
  "https://www.youtube.com/watch?v=cw9FIeHbdB8",
  "https://www.youtube.com/watch?v=imy1px59abE",
  "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
];
const scrape = async () => {
  // generate a headless browser instance
  const browser = await puppeteer.launch({ headless: true });
  // use .entries() to get both the index and the value
  for (const [index, url] of urls.entries()) {
    // generate a new page for each URL
    const page = await browser.newPage();
    // your one-minute delay, skipped for the first URL
    if (index > 0) await page.waitForTimeout(60 * 1000);
    // wait for the page response, with a 60-second timeout (throws on failure)
    await page.goto(`${url}`, {
      waitUntil: "networkidle2",
      timeout: 60 * 1000,
    });
    // wait for the .view-count section to be available
    await page.waitForSelector(".view-count");
    // no need for JSON.stringify / JSON.parse, as Puppeteer does that for you
    await page.evaluate(() => ({
      channel: document.querySelector("#text > a").innerText,
      title: document.querySelector("#container > h1").innerText,
      views: document.querySelector(".view-count").innerText,
      subs: document.querySelector("#owner-sub-count").innerText
    })).then(data => {
      // your successfully scraped data
      console.log('response', data);
    }).catch(reason => {
      // the reason scraping failed
      console.log('error', reason);
    }).finally(async () => {
      // close the current page
      await page.close();
    });
  }
  // after looping through all urls, close the browser
  await browser.close();
};
scrape();
I'd suggest a design like this:
const puppeteer = require("puppeteer"); // ^21.4.1
const {setTimeout} = require("node:timers/promises");
const scrapeTextSelectors = async (browser, url, textSelectors) => {
  let page;
  try {
    page = await browser.newPage();
    page.setDefaultNavigationTimeout(50 * 1000);
    await page.goto(url, {waitUntil: "domcontentloaded"});
    const dataPromises = textSelectors.map(async ({name, sel}) => {
      await page.waitForSelector(sel);
      return [name, await page.$eval(sel, e => e.innerText)];
    });
    return Object.fromEntries(await Promise.all(dataPromises));
  }
  finally {
    await page?.close();
  }
};
const urls = [
  "https://www.youtube.com/watch?v=cw9FIeHbdB8",
  "https://www.youtube.com/watch?v=imy1px59abE",
  "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
];
const textSelectors = [
  {name: "channel", sel: "#text > a"},
  {name: "title", sel: "#container > h1"},
  {name: "views", sel: ".view-count"},
  {name: "subs", sel: "#owner-sub-count"},
];
let browser;
(async () => {
  browser = await puppeteer.launch({headless: "new"});
  for (;; await setTimeout(60_000)) {
    const data = await Promise.allSettled(urls.map(url =>
      scrapeTextSelectors(browser, url, textSelectors)
    ));
    console.log(data);
  }
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
A few remarks:
- The URLs are processed in parallel with Promise.allSettled. If you have more URLs, you'll want a task queue, or run synchronously over the URLs with a for .. of loop so you don't outstrip the system's resources (see the sketch after this list). See this answer for elaboration.
- waitForSelector is called on each and every selector rather than just '.view-count' so you won't miss anything.
- page.setDefaultNavigationTimeout(50 * 1000) gives you an adjustable 50-second timeout on all navigations.
- Promise.allSettled lets the caller control what to do if any requests fail. You might want to filter and/or map the data response to remove the statuses: data.map(({value}) => value).
- The data is returned rather than console.logged to keep functions flexible. The caller can console.log in the format they desire, if they desire.
- There's no waitUntil: "networkidle2" on page.goto(url) because we're awaiting selectors on the very next line. "networkidle2" just slows things down, waiting for network requests that might not impact the selectors we're interested in.
- JSON.stringify/JSON.parse is already called by Puppeteer on the return value of evaluate, so you can skip it in most cases.
- The page and browser are closed in finally blocks. Your await setTimeout(scrape, 60000) in a finally block is misplaced.
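For illustration, here's a minimal sketch of that sequential for .. of variant, reusing the urls, textSelectors and scrapeTextSelectors names from the code above; scrapeSequentially is a hypothetical helper name, not part of the original design:

// Sketch only: assumes urls, textSelectors and scrapeTextSelectors from the example above.
// Scrapes the URLs one at a time instead of in parallel.
const scrapeSequentially = async (browser) => {
  const results = [];
  for (const url of urls) {
    // Each URL finishes before the next starts, so only one page is open at a time.
    // Unlike Promise.allSettled, a failure here throws and stops the loop.
    results.push(await scrapeTextSelectors(browser, url, textSelectors));
  }
  return results;
};
// e.g. inside the main loop, instead of the Promise.allSettled call:
// const data = await scrapeSequentially(browser);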