const Apify = require('apify');

// One result object per crawled page, in the order the page handlers finish.
const total_data = [];

// Two adjacent capitalised words that are followed by '.', ',', whitespace, '!' or '?'.
const regex_name = /[A-Z][a-z]+\s[A-Z][a-z]+(?=\.|,|\s|\!|\?)/gm;
// Text after a "stand:" label (optionally closed by </strong>): one or two pairs of words.
const regex_address = /stand:(<\/strong>)?\s+(\w+\s+\w+),?\s+(\w+\s+\w+)?/gm;
// First e-mail-shaped token. Deliberately NOT global: only the first hit is used.
const regex_email = /(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/i;

/**
 * Logs the full match and every capture group of a single regex match.
 * Groups that did not participate in the match are printed as "undefined".
 *
 * @param {RegExpMatchArray} match - Result of `exec` or one item from `matchAll`.
 */
function logMatchGroups(match) {
    match.forEach((group, groupIndex) => {
        console.log(`Found match, group ${groupIndex}: ${group}`);
    });
}

/**
 * Collects every substring of `html` that matches `regex_name`.
 *
 * `matchAll` works on an internal copy of the regex, so the shared
 * module-level pattern's `lastIndex` is never mutated, and zero-width
 * matches are advanced past automatically.
 *
 * @param {string} html - Page markup to scan.
 * @returns {string[]} All matched names, in document order (possibly empty).
 */
function extractNames(html) {
    const names = [];
    for (const match of html.matchAll(regex_name)) {
        logMatchGroups(match);
        names.push(match[0]);
    }
    return names;
}

/**
 * Finds the address-like text matched by `regex_address`.
 *
 * When several matches exist the LAST one wins. If the match contains a
 * closing `</strong>` tag, only the text after that tag is returned;
 * otherwise the whole match (including the "stand:" label) is returned.
 *
 * @param {string} html - Page markup to scan.
 * @returns {string|undefined} The extracted text, or undefined if nothing matched.
 */
function extractAddress(html) {
    let address;
    for (const match of html.matchAll(regex_address)) {
        logMatchGroups(match);
        const [fullMatch] = match;
        address = fullMatch.includes('</strong>')
            ? fullMatch.split('</strong>')[1]
            : fullMatch;
    }
    return address;
}

/**
 * Returns the first substring of `html` that matches `regex_email`.
 *
 * @param {string} html - Page markup to scan.
 * @returns {string|undefined} The first e-mail match, or undefined if none.
 */
function extractEmail(html) {
    // regex_email has no `g` flag, so exec() is stateless and always scans from index 0.
    const match = regex_email.exec(html);
    if (match === null) {
        return undefined;
    }
    logMatchGroups(match);
    return match[0];
}

// Entry point: enqueue the start URLs, crawl them with Puppeteer, and print
// the names / address / e-mail extracted from each page's HTML.
Apify.main(async () => {
    const requestQueue = await Apify.openRequestQueue('123');
    await requestQueue.addRequest(new Apify.Request({ url: 'https://www.freeletics.com/de/pages/imprint/' }));
    await requestQueue.addRequest(new Apify.Request({ url: 'https://di1ara.com/pages/impressum' }));

    console.log('\nStart PuppeteerCrawler\n');

    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,
        handlePageFunction: async ({ request, page }) => {
            const title = await page.title();
            console.log(`Title of ${request.url}: ${title}`);

            const page_content = await page.content();
            console.log(`Page content size:`, page_content.length);

            const obj = { url: request.url };

            console.log('Names:');
            const names = extractNames(page_content);
            // Only set the key when something was found, mirroring how
            // `address` and `email` stay absent when there is no match.
            if (names.length > 0) {
                obj.names = names.join(', ');
            }

            console.log('\nAddress:');
            const address = extractAddress(page_content);
            if (address !== undefined) {
                obj.address = address;
            }

            console.log('\nEmail:');
            const email = extractEmail(page_content);
            if (email !== undefined) {
                obj.email = email;
            }

            total_data.push(obj);
            console.log(obj);
        },
        maxRequestsPerCrawl: 2000000,
        maxConcurrency: 20,
    });

    await crawler.run();

    console.log('Total data:');
    console.log(total_data);
});
// Categories