```javascript async requestHandler({ request, page, enqueueLinks }) { // Extract links from the current page // and add them to the crawling queue. console.log(`Handling page: ${page.url()}`); try { /** Implement slow scroling so that images are lazy loaded */ await page.evaluate(() => { window.scrollTo({ top: 0, behavior: 'smooth' // Optional: Use smooth scrolling animation }); }); await page.waitForTimeout(3000); // Adjust the delay as needed let isEndReached = false; while (!isEndReached) { const scrollPositionBefore = await page.evaluate(() => window.scrollY); await page.mouse.wheel(0, 100); // Wait for a short delay to allow the page to render await page.waitForTimeout(50); const scrollPositionAfter = await page.evaluate(() => window.scrollY); if (scrollPositionBefore === scrollPositionAfter) { isEndReached = true; } } // Capture all page links let allLinks = await page.evaluate(() => { return Array.from(document.querySelectorAll('a')).map(a => a.href).filter(a => a); }); /** Handle # based links */ let urls = allLinks.map(u => u.split('#')[0]); /** Consider only domain links excluding file links */ const fileRegex = new RegExp('\.(pdf|xlsx?|docx?|rtf|zip?|vcf)', 'i') const base64Regex = new RegExp('data:image\\/[^;]+;base64', 'i'); urls = urls.filter(u => u.startsWith(websiteURL) && !fileRegex.test(u) && !base64Regex.test(u)); urls = urls.map(u => { const url = new URL(u); for (const [key, value] of url.searchParams) { for (const blacklistKey of queryBlackList) { if (key.includes(blacklistKey)) { console.log('Removing query param:', key); url.searchParams.delete(key); url.search = url.searchParams.toString(); break; } } } return url.href; }); /** Normalize URL due to endless looping where URLs like http://example.com/blog/blog/page/blog/blog/blog/blog are picked up */ const normalizeUrl = (url) => { try { const parsedUrl = new URL(url); const segments = parsedUrl.pathname.split('/').filter(Boolean); const cleanedSegments = []; for (const segment of segments) { // Create a regex pattern to match whole words const isBlacklisted = segmentsBlackList.some(blacklisted => { const regex = new RegExp(`\\b${blacklisted}\\b`, 'i'); return regex.test(segment); }); if (isBlacklisted) { console.log(`Filtering out URL due to blacklisted segment: ${url}`); return null; } if (!cleanedSegments.includes(segment)) { cleanedSegments.push(segment); } } parsedUrl.pathname = '/' + cleanedSegments.join('/'); return parsedUrl.toString(); } catch (e) { console.error('Invalid URL:', url); console.log(e) return null; } }; urls = urls.map(normalizeUrl).filter(u => u !== null); // Filter out null values const result = await enqueueLinks({ urls }); const enqueued = result.processedRequests.filter(r => !r.wasAlreadyPresent && !r.wasAlreadyHandled); console.log(`New urls enqueued : ${enqueued.length} (/${urls.length} total found)`); await page.waitForLoadState('load'); } catch (error) { console.log('Request Handler Error: ' + error); } }, preNavigationHooks: [ async (crawlingContext) => { try { const { page, request, settings } = crawlingContext; page.on('request', async (pageobj) => { const requestUrl = pageobj.url(); if (pageobj.resourceType() === 'image') { // try { // var requestSize = await pageobj.sizes(); // } catch (error) { // //console.log(error); // var requestSize = 1751; // } if (requestUrl.match(excludedImgUrls) == null && requestUrl.length > 0 && imgUrlTracking.indexOf(requestUrl) == -1) { cb.push({ imgurl: requestUrl, pageurl: [request.url] }); imgUrlTracking.push(requestUrl); } else if (imgUrlTracking.indexOf(requestUrl) > -1) { cb.forEach((item, index) => { if (item.imgurl === requestUrl && cb[index].pageurl.length < 250) { cb[index].pageurl.push(request.url); } }); } } }) } catch (error) { console.log('Request Hook Error: ' + error); } }, ], postNavigationHooks: [ async (crawlingContext) => { const { page, request, settings } = crawlingContext; const InfiniteScrollOptions = { scrollDownAndUp: false, waitForSecs: 15, timeoutSecs: 25, }; try { await playwrightUtils.infiniteScroll(page, InfiniteScrollOptions) } catch (error) { console.log('Infinite Scroll ERROR: ' + error); } }, ] // ```