const Apify = require('apify');
// Collected results: the page handler pushes one object per crawled page into this array.
const total_data = [];

// Two adjacent capitalized words (e.g. "Max Mustermann") that are immediately
// followed by '.', ',', whitespace, '!' or '?'. Global + multiline.
const regex_name = /[A-Z][a-z]+\s[A-Z][a-z]+(?=\.|,|\s|\!|\?)/gm;

// Text following a literal "stand:" (optionally closed by "</strong>"):
// group 2 is the first word pair, group 3 an optional second word pair.
const regex_address = /stand:(<\/strong>)?\s+(\w+\s+\w+),?\s+(\w+\s+\w+)?/gm;

// E-mail address matcher (case-insensitive). Intentionally NOT global, so
// `exec` always searches from the start of the input and keeps no state.
const regex_email = /(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/i;
/**
 * Logs every capture group of a single regex match, one line per group.
 *
 * @param {RegExpMatchArray} match - Result of `exec` / one item of `matchAll`.
 */
function logMatchGroups(match) {
  match.forEach((group, groupIndex) => {
    console.log(`Found match, group ${groupIndex}: ${group}`);
  });
}

/**
 * Scans a page's HTML for names, an address and an e-mail address using the
 * module-level `regex_name`, `regex_address` and `regex_email` patterns.
 *
 * @param {string} url - URL the content was loaded from; stored as `url`.
 * @param {string} pageContent - Full HTML of the page.
 * @returns {{url: string, names?: string, address?: string, email?: string}}
 *   `names` is a ', '-separated list of all name matches, `address` is the
 *   last address match on the page, `email` is the first e-mail match.
 *   A key is only present when the corresponding pattern matched.
 */
function extractContactData(url, pageContent) {
  const result = { url };

  console.log('Names:');
  const names = [];
  // matchAll iterates over an internal copy of the regex, so the shared
  // module-level /g regex never has its lastIndex advanced by this loop.
  for (const match of pageContent.matchAll(regex_name)) {
    logMatchGroups(match);
    names.push(match[0]);
  }
  if (names.length > 0) {
    // Joined once at the end: appending with `+=` to a missing property
    // produced a literal "undefined" prefix and a dangling ", ".
    result.names = names.join(', ');
  }

  console.log('\nAddress:');
  for (const match of pageContent.matchAll(regex_address)) {
    logMatchGroups(match);
    const [fullMatch] = match;
    // Keep only the text after the closing tag when the label was bold.
    const afterLabel = fullMatch.includes('</strong>')
      ? fullMatch.split('</strong>')[1]
      : fullMatch;
    // Each match overwrites the previous one, so the last match wins.
    result.address = afterLabel.replace('<', '');
  }

  console.log('\nEmail:');
  // regex_email is not global, so a single exec returns the first match
  // (looping over it would never advance and relies on a `break` to stop).
  const emailMatch = regex_email.exec(pageContent);
  if (emailMatch !== null) {
    logMatchGroups(emailMatch);
    result.email = emailMatch[0];
  }

  return result;
}

// Entry point: enqueues the two imprint pages, crawls them with Puppeteer and
// collects one extracted-data object per page into `total_data`.
Apify.main(async () => {
  const requestQueue = await Apify.openRequestQueue('123');
  await requestQueue.addRequest(new Apify.Request({ url: 'https://www.freeletics.com/de/pages/imprint/' }));
  await requestQueue.addRequest(new Apify.Request({ url: 'https://di1ara.com/pages/impressum' }));

  console.log('\nStart PuppeteerCrawler\n');
  const crawler = new Apify.PuppeteerCrawler({
    requestQueue,
    // Invoked for each loaded page: logs basic page info, then extracts and
    // stores the contact data found in the page's HTML.
    handlePageFunction: async ({ request, page }) => {
      const title = await page.title();
      console.log(`Title of ${request.url}: ${title}`);

      const pageContent = await page.content();
      console.log('Page content size:', pageContent.length);

      const pageData = extractContactData(request.url, pageContent);
      total_data.push(pageData);
      console.log(pageData);
    },
    maxRequestsPerCrawl: 2000000,
    maxConcurrency: 20,
  });

  await crawler.run();

  console.log('Total data:');
  console.log(total_data);
});