
Scrape with Google App Script

In this post I want to let you know how I’ve managed to complete the challenge of scraping a site with Google Apps Script (GAS).

The Challenge

The challenge was to scrape arbitrary sites and save all of the site’s pure text (stripping all the HTML markup) into a single file. Originally I was going to use Python or PHP, but then I thought I’d try Google Apps Script instead, and it turned out pretty well.
To handle page retrieval I use:

//Fetch page for scrape with Google App Script
var html = UrlFetchApp.fetch(url).getContentText();
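
If a page might be unavailable, UrlFetchApp can also be told not to throw on HTTP error responses; a minimal sketch (muteHttpExceptions and followRedirects are standard UrlFetchApp options, the variant itself is my own and not part of the original script):

//Fetch page without throwing on 404/500 responses
var response = UrlFetchApp.fetch(url, {muteHttpExceptions: true, followRedirects: true});
if (response.getResponseCode() === 200) {
    var html = response.getContentText();
}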

Gathering Internal Links

Before we strip the HTML from a given page, we have to gather all the links from it (using a regex) so we can add them to our crawling links array. Since we only want to crawl inside the site’s boundaries, I limited the matches to internal links only.

Gather links for scrape with Google App Script

var inner_links_arr= [];
var linkRegExp = /href="(.*?)"/gi; // regular expression object
var match = linkRegExp.exec(html);
while (match != null) { // we keep only inner links and skip pdf docs
     if (match[1].indexOf('#') !== 0
            && match[1].indexOf('http://') !== 0
            && match[1].indexOf('https://') !== 0
            && match[1].indexOf('mailto:') !== 0
            && match[1].indexOf('.pdf') === -1 )
            {
                inner_links_arr.push(match[1]);
            }
     match = linkRegExp.exec(html);
}
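
Note that the captured hrefs are usually relative (for example /about or page.html), so before fetching them they have to be joined with the site’s base url. A minimal normalization sketch; the helper name toAbsoluteUrl is my own and is not part of the original script:

function toAbsoluteUrl(base_url, link) {
     if (link.indexOf('http://') === 0 || link.indexOf('https://') === 0) {
            return link; // already absolute
     }
     if (link.indexOf('/') === 0) {
            return base_url + link; // root-relative, e.g. /contact
     }
     return base_url + '/' + link; // plain relative, e.g. page.html
}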

Strip the HTML

For this purpose we use the Xml service. Xml.parse, with the second parameter set to true, parses the document as an HTML page. Then we walk through the document (which is patched up with missing HTML and BODY elements, etc., and turned into a valid XHTML document), turning text nodes into text and expanding all other nodes.

Strip html markup for scrape with Google App Script

function getTextFromHtml(html) {
     return getTextFromNode(Xml.parse(html, true).getElement());
}

function getTextFromNode(x) {
     switch(x.toString()) {
            case 'XmlText': return x.toXmlString();
            case 'XmlElement': return x.getNodes().map(getTextFromNode).join(' ');
            default: return '';
     }
}

//Output the text into the current Google Docs file
function outputText(url, text){
     var body = DocumentApp.getActiveDocument().getBody();
     body.appendHorizontalRule();
     var section = body.appendParagraph(' * ' + url); // set current page url as an h2 heading
     section.setHeading(DocumentApp.ParagraphHeading.HEADING2);
     body.appendParagraph(text);
}
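
Note: the legacy Xml service (Xml.parse) used in getTextFromHtml has since been deprecated in Apps Script. If it is not available in your project, a rough fallback is to strip the markup with regular expressions instead; this is only a sketch of mine, not part of the original script, and it is less robust than a real parser:

function getTextFromHtmlFallback(html) { // hypothetical replacement for getTextFromHtml
     return html
            .replace(/<script[\s\S]*?<\/script>/gi, ' ') // drop script blocks
            .replace(/<style[\s\S]*?<\/style>/gi, ' ')   // drop style blocks
            .replace(/<[^>]+>/g, ' ')                    // strip the remaining tags
            .replace(/\s+/g, ' ')                        // collapse whitespace
            .trim();
}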

The Whole Code

There are some utility functions I did not mention before, but I think you’ll easily catch on to them. 🙂

The entire code of scrape with Google App Script

function onOpen() {
    DocumentApp.getUi() // Or DocumentApp or FormApp.
        .createMenu('New scrape web docs')
        .addItem('Enter Url', 'showPrompt')
        .addToUi();
}
function showPrompt() {
    var ui = DocumentApp.getUi();
    var result = ui.prompt(
        'Scrape whole website into text!',
        'Please enter website url (with http(s)://):',
        ui.ButtonSet.OK_CANCEL);
    // Process the user's response.
    var button = result.getSelectedButton();
    var url = result.getResponseText();
    var links = [];
    var base_url = url;

    if (button == ui.Button.OK) { // User clicked "OK".
        if (!isValidURL(url)) {
            ui.alert('Your url is not valid.');
        }
        else {
            // gather initial links
            var inner_links_arr = scrapeAndPaste(url, 1); // first run and clear the document
            links = links.concat(inner_links_arr); // append an array to all the links
            var new_links = []; // array for new links
            var processed_urls = [url]; // processed links
            var link, current;
            while (links.length) {
                link = links.shift(); // get the leftmost link (inner url)
                processed_urls.push(link);
                current = base_url + link;
                new_links = scrapeAndPaste(current, 0); // on the second and consecutive runs we do not clear the document
                //ui.alert('Processed... ' + current + ', returned links: ' + new_links.join(','));
                // add new links into the links array (stack) if appropriate
                for (var i in new_links) {
                    var item = new_links[i];
                    if (links.indexOf(item) === -1 && processed_urls.indexOf(item) === -1)
                        links.push(item);
                }
                /* // alert message for debugging
                ui.alert('Links in stack: ' + links.join(' ')
                    + ' Total links in stack: ' + links.length
                    + ' Processed: ' + processed_urls.join(' ')
                    + ' Total processed: ' + processed_urls.length);
                */
            }
        }
    }
}
  
function scrapeAndPaste(url, clear) {
    var text;
    try {
        var html = UrlFetchApp.fetch(url).getContentText();
        // some html pre-processing
        if (html.indexOf('</head>') !== -1) {
            html = html.split('</head>')[1];
        }
        if (html.indexOf('</body>') !== -1) { // thus we keep only the body
            html = html.split('</body>')[0] + '</body>';
        }
        // fetch inner links
        var inner_links_arr = [];
        var linkRegExp = /href="(.*?)"/gi; // regular expression object
        var match = linkRegExp.exec(html);
        while (match != null) {
            // matched text: match[0]
            if (match[1].indexOf('#') !== 0
             && match[1].indexOf('http://') !== 0
             && match[1].indexOf('https://') !== 0
             && match[1].indexOf('mailto:') !== 0
             && match[1].indexOf('.pdf') === -1) {
                inner_links_arr.push(match[1]);
            }
            // match start: match.index
            // capturing group n: match[n]
            match = linkRegExp.exec(html);
        }
        text = getTextFromHtml(html);
        outputText(url, text, clear); // output text into the current document with the given url
        return inner_links_arr; // we return all inner links of this doc as an array

    } catch (e) {
        MailApp.sendEmail(Session.getActiveUser().getEmail(), "Scrape error report at "
            + Utilities.formatDate(new Date(), "GMT", "yyyy-MM-dd HH:mm:ss"),
            "\r\nMessage: " + e.message
            + "\r\nFile: " + e.fileName + '.gs'
            + "\r\nWeb page under scrape: " + url
            + "\r\nLine: " + e.lineNumber);
        outputText(url, 'Scrape error for this page because of malformed html!', clear);
    }
}
  
function getTextFromHtml(html) {
    return getTextFromNode(Xml.parse(html, true).getElement());
}
function getTextFromNode(x) {
    switch(x.toString()) {
        case 'XmlText': return x.toXmlString();
        case 'XmlElement': return x.getNodes().map(getTextFromNode).join(' ');
        default: return '';
    }
}

function outputText(url, text, clear) {
    var body = DocumentApp.getActiveDocument().getBody();
    if (clear) {
        body.clear();
    }
    else {
        body.appendHorizontalRule();
    }
    var section = body.appendParagraph(' * ' + url);
    section.setHeading(DocumentApp.ParagraphHeading.HEADING2);
    body.appendParagraph(text);
}
  
function isValidURL(url) {
    var urlRegExp = /^(([\w]+:)?\/\/)?(([\d\w]|%[a-fA-F\d]{2})+(:([\d\w]|%[a-fA-F\d]{2})+)?@)?([\d\w][-\d\w]{0,253}[\d\w]\.)+[\w]{2,4}(:[\d]+)?(\/([-+_~.\d\w]|%[a-fA-F\d]{2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-F\d]{2})=?)*)?(#([-+_~.\d\w]|%[a-fA-F\d]{2})*)?$/;
    return urlRegExp.test(url);
}

You can read here how to automatically convert files in Google Apps Script.

Disclaimer

The script works well, but the Google Apps Script execution limit is 6 minutes, so it won’t work for huge sites (the script is stopped after 6 minutes with no output in the Google Docs file). I plan on improving it to handle scraping huge sites, and you are welcome to contribute your suggestions for that.
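
One possible workaround, sketched below, is to watch the elapsed time, persist the remaining links with PropertiesService, and schedule a time-based trigger to resume. This is only an outline of the idea, not code from the post; the function name resumeScrape and the property keys are my own:

// Sketch: stop before the ~6 min limit and resume later via a trigger
var START = new Date().getTime();
function timeLeft() {
     return 5 * 60 * 1000 - (new Date().getTime() - START); // leave a safety margin of about a minute
}
function saveStateAndScheduleResume(links, processed_urls) {
     var props = PropertiesService.getScriptProperties();
     props.setProperty('links', JSON.stringify(links));
     props.setProperty('processed_urls', JSON.stringify(processed_urls));
     ScriptApp.newTrigger('resumeScrape') // resumeScrape would reload the saved state and continue the while loop
            .timeBased()
            .after(60 * 1000) // run again in about a minute
            .create();
}

Inside the while loop one would then call saveStateAndScheduleResume(links, processed_urls) and return as soon as timeLeft() drops below zero.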

24 replies on “Scrape with Google App Script”

Thanks for a cool tutorial. I tried to utilize your code to extract all urls in a page (starting with http:// and ending with .pdf). What linkRegExp should I use? Hope you can help me here, thanks.

Thanks for the reply. I tried that and also tried var linkRegExp = /~http:\/\/.*?\~/gi; and the array is always empty after the while loop in the scrapeAndPaste function (Logger.log(inner_links_arr);)! I just want to extract all the urls in a page starting with http!

For the purpose of testing I put html="http://www.someurl.com/test.pdf"; instead of pointing to a remote website, and still the array is empty. I think the code never enters "while (match != null)", which indicates match is always null!

See in the code:

var inner_links_arr= [];
var linkRegExp = /href="(.*?)"/gi; // regex expression object
var match = linkRegExp.exec(html);
while (match != null) { // we filter only inner links and not pdf docs
   if (match[1].indexOf('#') !== 0
      && match[1].indexOf('http://') !== 0
      && match[1].indexOf('https://') !== 0
      && match[1].indexOf('mailto:') !== 0
      && match[1].indexOf('.pdf') === -1 ) 
      {
        inner_links_arr.push(match[1]);
      }
   match = linkRegExp.exec(html);
}

There is a comment in the code: we filter only inner links and not pdf docs. So to keep the pdf links in, you must remove the .pdf check from the if-filter,
&& match[1].indexOf('.pdf') === -1, so that they are not filtered out. Have you removed this expression from the code?

I tried the following code and Logger.log(inner_links_arr) still shows [null]. This is the code:

function test()
{
  html="first pdf http://www.someurl.com/test1.pdf second pdf http://www.someurl.com/test2.pdf";
 var inner_links_arr= [];
var linkRegExp = /href="(.*?)"/gi; // regex expression object
var match = linkRegExp.exec(html);
while (match != null) { // we filter only inner links and not pdf docs
   if (match[1].indexOf('#') !== 0
      && match[1].indexOf('http://') !== 0
      && match[1].indexOf('https://') !== 0
      && match[1].indexOf('mailto:') !== 0)
      //&& match[1].indexOf('.pdf') === -1 ) 
      {
        inner_links_arr.push(match[1]);
      }
   match = linkRegExp.exec(html);
} 
   Logger.log(inner_links_arr);
}

See my code and the link to the working demo (the 3rd link is an .html one):

function test()
{
    // placeholder anchors: the original markup was stripped from the blog comment, the urls are illustrative only
    var html = 'first pdf <a href="http://www.someurl.com/test1.pdf">test1</a> second pdf ' +
               '<a href="http://www.someurl.com/test2.pdf">assassa</a>' +
               '<a href="http://www.someurl.com/page.html">assassa</a>';
    var inner_links_arr = [];
    var linkRegExp = /href="(.*?\.pdf)"/gi; // regular expression object: capture only hrefs ending in .pdf
    var match = linkRegExp.exec(html);
    while (match != null) {
      inner_links_arr.push(match[1]);
      match = linkRegExp.exec(html);
    }
    var body = DocumentApp.getActiveDocument().getBody();
    body.appendHorizontalRule();
    body.appendParagraph(inner_links_arr);
}

shared with you at panda15@sharklasers.com

Hi,

@Igor
How to get the page content within Google Apps Script when it is loaded by JavaScript/Ajax?

@Jim
Can you please provide the direct link to your script?

Regards,
Peter

Try using fetchAll. It can grab dozens of links asynchronously / simultaneously, so the 6-minute limit might not be a problem then.
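
For reference, a minimal UrlFetchApp.fetchAll sketch (assuming the links array and base_url from the post; the rest is my own illustration, not code from the comment):

// Sketch: fetch several pages in one batch
var requests = links.map(function(link) {
     return {url: base_url + link, muteHttpExceptions: true};
});
var responses = UrlFetchApp.fetchAll(requests); // returns an array of HTTPResponse objects
responses.forEach(function(response, i) {
     if (response.getResponseCode() === 200) {
            Logger.log(requests[i].url + ' -> ' + response.getContentText().length + ' chars');
     }
});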

I’m not sure I understood why we need to meet that many conditions for the inner links. A link can’t have http && https && mailto at the same time? Should it be || instead of &&?

@igor, I’m from a non-developer background, and I’m facing an issue implementing these codes in my Google Sheet. Please help me with an email scraper Google Apps Script for Google Sheets.
