Categories
Development

Cheerio.js, get items from html table into object

Suppose there is a table like the one below (one data row only):

Blows
Minute (BPM)
Speed (RPM) Power, PSI Flow, PSI
Tool Sys
0-2500 0-250 1.8 HP 2.6-13.2 GPM SDS Max

How can we scrape it using cheerio.js as the parser?

Case 1 (1 row only)

1. Load it from the file system

// Read the saved table markup from disk and parse it with cheerio.
fs.readFile('table.html', 'utf8', function(err, data) { 
    // Surface read failures (e.g. missing file) instead of parsing undefined data.
    if (err) throw err;
    const $table_obj = cheerio.load(data);
    // Print the parsed markup to confirm the file was loaded.
    // (The original logged `$table`, which is never declared.)
    console.log($table_obj.html());
});

2. A function to process the table

/**
 * Flatten a table into a single object keyed by column header text.
 *
 * Header texts come from `thead tr th`. Every `tr` is visited and each `td`
 * without a `colspan` attribute is stored under the header at the same
 * position within its row, so a later row overwrites an earlier one.
 *
 * @param {Function} cheerio_table_object - Query function for the parsed document.
 * @param {string[]} [remove_tags=[]] - Selectors whose matches are replaced with '' first.
 * @returns {Object<string, string>} Cell texts keyed by header text.
 */
function processTable(cheerio_table_object, remove_tags=[] ){
	const $ = cheerio_table_object;
	const headerTexts = [];
	const record = {};

	// Optional clean-up pass before any text is read.
	const dropMatches = (selector) => {
		$(selector).replaceWith('');
	};
	if (remove_tags.length){
		remove_tags.forEach(dropMatches);
	}

	// Header cells, in document order.
	const headerCells = $('thead tr th');
	headerCells.each((position, headerCell) => {
		const label = $(headerCell).text();
		headerTexts.push(label);
	});

	// Data cells: pair each plain cell with the header at the same position.
	const rows = $('tr');
	rows.each((rowNumber, row) => {
		const plainCells = $('td:not([colspan])', row);
		plainCells.each((position, cell) => {
			const key = headerTexts[position];
			record[key] = $(cell).text();
		});
	});

	return record;
}

3. Now the whole code

var cheerio = require('cheerio');
var fs = require('fs'); 

/**
 * Flatten a table into a single object keyed by column header text.
 *
 * Header texts come from `thead tr th`. Every `tr` is visited and each `td`
 * without a `colspan` attribute is stored under the header at the same
 * position within its row, so a later row overwrites an earlier one.
 *
 * @param {Function} cheerio_table_object - Query function for the parsed document.
 * @param {string[]} [remove_tags=[]] - Selectors whose matches are replaced with '' first.
 * @returns {Object<string, string>} Cell texts keyed by header text.
 */
function processTable(cheerio_table_object, remove_tags=[] ){
	const $ = cheerio_table_object;
	const headerTexts = [];
	const record = {};

	// Optional clean-up pass before any text is read.
	const dropMatches = (selector) => {
		$(selector).replaceWith('');
	};
	if (remove_tags.length){
		remove_tags.forEach(dropMatches);
	}

	// Header cells, in document order.
	const headerCells = $('thead tr th');
	headerCells.each((position, headerCell) => {
		const label = $(headerCell).text();
		headerTexts.push(label);
	});

	// Data cells: pair each plain cell with the header at the same position.
	const rows = $('tr');
	rows.each((rowNumber, row) => {
		const plainCells = $('td:not([colspan])', row);
		plainCells.each((position, cell) => {
			const key = headerTexts[position];
			record[key] = $(cell).text();
		});
	});

	return record;
}

// Entry point for case 1: read the markup, parse it, extract the row, print it.
fs.readFile('table.html', 'utf8', (readError, markup) => {
	if (readError) {
		throw readError;
	}
	const $table_obj = cheerio.load(markup);
	// Debug aid: console.log($table_obj.html());
	// <br> elements are stripped before the header and cell texts are read.
	const extracted = processTable($table_obj, ['br']);
	console.log('Result as an object:', extracted);
	console.log('Result as json:', JSON.stringify(extracted));
});

4. The result output is the following:

Result as an object: {
'Blows // Minute (BPM)': '0-2500',
'Drill Speed (RPM)': '0-250',
'Power at 1160 PSI': '1.8 HP',
'Flow at 1160 PSI': '2.6-13.2 GPM',
'Tool System': 'SDS Max'
}
Result as json: {"Blows // Minute (BPM)":"0-2500","Drill Speed (RPM)":"0-250","Power at 1160 PSI":"1.8 HP","Flow at 1160 PSI":"2.6-13.2 GPM","Tool System":"SDS Max"}

Case 2 (2 or more rows)

If there is a table like the one below (two or more data rows):

Description Model No.
Weight
(lbs.)
Blows
per Minute
CFM 90 Noise
dB(a)
Crawler 2203 7 2400 4 76.3
Scraper 5555 9.5 3000 5.5 78.2

Then the whole code will look different.

var cheerio = require('cheerio');
var fs = require('fs'); 

/**
 * Convert a multi-row HTML table into an object of row objects.
 *
 * Header texts are read from `thead tr th`. Every `tr` is visited, and each of
 * its `td` cells without a `colspan` attribute is stored under the header at
 * the same position within the row. Rows that yield no such cells (such as a
 * row containing only `th` cells) are skipped, so the result keys are the
 * original `tr` indexes and need not start at 0.
 *
 * @param {Function} cheerio_table_object - Query function returned by `cheerio.load()`.
 * @param {string[]} [remove_tags=[]] - Selectors whose matches are replaced
 *   with an empty string before any text is extracted.
 * @returns {Object<number, Object<string, string>>} Row objects keyed by `tr` index.
 */
function processCase2table(cheerio_table_object, remove_tags=[] ){
	const columns = [];
	const items = {};
	// preprocessing, eg. remove tags
	if (remove_tags.length){
		remove_tags.forEach(tag => {
			cheerio_table_object(tag).replaceWith('');
		});
	}

	// get columns
	cheerio_table_object('thead tr th').each((index, el) => {
		columns.push(cheerio_table_object(el).text());
	});

	// get data, one object per row
	cheerio_table_object('tr').each((tr_index, tr) => {
		const item = {};
		cheerio_table_object('td:not([colspan])', tr ).each((index, td) => {
			// Read the cell through the instance that was passed in (as the header
			// extraction does) rather than through a module-level `cheerio`, so the
			// function works wherever it is called and builds no extra document per cell.
			item[columns[index]] = cheerio_table_object(td).text();
		});
		// keep only rows that produced at least one cell
		if (Object.keys(item).length > 0) {
			items[tr_index] = item;
		}
	});

	return items;
}

// Entry point for case 2: read the markup, parse it, extract all rows, print them.
fs.readFile('table-case-2.html', 'utf8', (readError, markup) => {
	if (readError) {
		throw readError;
	}
	const $table_obj = cheerio.load(markup);
	// Debug aid: console.log($table_obj.html());
	// <br> elements are stripped before the header and cell texts are read.
	const extracted = processCase2table($table_obj, ['br']);
	console.log('Result as an object:\n', extracted);
	// Alternative output: console.log('Result as json:', JSON.stringify( extracted ));
});

Result:

Result as an object: 
{
  '1': {
    Description: 'Crawler',
    'Model No.': '2203',
    '\n            Weight(lbs.)\n         ': '7',
    'Blowsper Minute': '2400',
    'CFM 90 ': '4',
    'NoisedB(a)': '76.3',
  },
  '2': {
    Description: 'Scraper',
    'Model No.': '5555',
    '\n            Weight(lbs.)\n         ': '9.5',
    'Blowsper Minute': '3000',
    'CFM 90 ': '5.5',
    'NoisedB(a)': '78.2',
  }
}

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.