Suppose there is a table like the one below (one data row only):
Blows // Minute (BPM) | Drill Speed (RPM) | Power at 1160 PSI | Flow at 1160 PSI | Tool System |
---|---|---|---|---|
0-2500 | 0-250 | 1.8 HP | 2.6-13.2 GPM | SDS Max |
How can we scrape it using cheerio.js as the parser?
Case 1 (1 row only)
1. Load it from file system
// Read the saved HTML from disk and hand it to cheerio for parsing.
fs.readFile('table.html', 'utf8', function (err, data) {
  if (err) throw err;
  // Keep the value returned by cheerio.load() under one name and use that
  // same name below; the earlier `$table` reference was never declared.
  const $table_obj = cheerio.load(data);
  console.log($table_obj.html());
});
2. Procedure to process table
/**
 * Extracts a single-row HTML table into a flat `{ header: value }` object.
 *
 * @param {Function} cheerio_table_object - The query function returned by
 *   `cheerio.load(html)`. It is called with selectors and with elements, so a
 *   selection object such as `$('table')` will not work here.
 * @param {string[]} [remove_tags=[]] - Selectors whose matches are replaced
 *   with an empty string before any text is read, e.g. `['br']`.
 * @returns {Object<string, string>} Cell text keyed by column header text.
 *   When several rows contain cells, later rows overwrite earlier ones.
 * @throws {TypeError} If `cheerio_table_object` is not callable.
 */
function processTable(cheerio_table_object, remove_tags = []) {
  if (typeof cheerio_table_object !== 'function') {
    throw new TypeError(
      'processTable expects the function returned by cheerio.load(), not a selection object'
    );
  }

  const $ = cheerio_table_object;
  const columns = [];
  const items = {};

  // Preprocessing: drop unwanted tags (this mutates the loaded document).
  if (remove_tags.length) {
    remove_tags.forEach((tag) => {
      $(tag).replaceWith('');
    });
  }

  // Header cells define the keys of the resulting object, in column order.
  $('thead tr th').each((columnIndex, th) => {
    columns.push($(th).text());
  });

  // Data cells are matched to headers by position; cells carrying a colspan
  // attribute are excluded by the selector.
  $('tr').each((rowIndex, tr) => {
    $('td:not([colspan])', tr).each((cellIndex, td) => {
      // A cell with no matching header would otherwise be stored under the
      // literal key "undefined".
      if (cellIndex >= columns.length) {
        return;
      }
      items[columns[cellIndex]] = $(td).text();
    });
  });

  return items;
}
3. Now the whole code
const cheerio = require('cheerio');
const fs = require('fs');

/**
 * Extracts a single-row HTML table into a flat `{ header: value }` object.
 *
 * @param {Function} cheerio_table_object - The query function returned by
 *   `cheerio.load(html)`. It is called with selectors and with elements, so a
 *   selection object such as `$('table')` will not work here.
 * @param {string[]} [remove_tags=[]] - Selectors whose matches are replaced
 *   with an empty string before any text is read, e.g. `['br']`.
 * @returns {Object<string, string>} Cell text keyed by column header text.
 *   When several rows contain cells, later rows overwrite earlier ones.
 * @throws {TypeError} If `cheerio_table_object` is not callable.
 */
function processTable(cheerio_table_object, remove_tags = []) {
  if (typeof cheerio_table_object !== 'function') {
    throw new TypeError(
      'processTable expects the function returned by cheerio.load(), not a selection object'
    );
  }

  const $ = cheerio_table_object;
  const columns = [];
  const items = {};

  // Preprocessing: drop unwanted tags (this mutates the loaded document).
  if (remove_tags.length) {
    remove_tags.forEach((tag) => {
      $(tag).replaceWith('');
    });
  }

  // Header cells define the keys of the resulting object, in column order.
  $('thead tr th').each((columnIndex, th) => {
    columns.push($(th).text());
  });

  // Data cells are matched to headers by position; cells carrying a colspan
  // attribute are excluded by the selector.
  $('tr').each((rowIndex, tr) => {
    $('td:not([colspan])', tr).each((cellIndex, td) => {
      // A cell with no matching header would otherwise be stored under the
      // literal key "undefined".
      if (cellIndex >= columns.length) {
        return;
      }
      items[columns[cellIndex]] = $(td).text();
    });
  });

  return items;
}

// Load the saved table, strip <br> tags, and print the extracted row both as
// an object and as JSON.
fs.readFile('table.html', 'utf8', function (err, data) {
  if (err) throw err;
  const $table_obj = cheerio.load(data);
  // Debug aid: console.log($table_obj.html());
  const res = processTable($table_obj, ['br']);
  console.log('Result as an object:', res);
  console.log('Result as json:', JSON.stringify(res));
});
4. The result output is the following:
Result as an object: {
'Blows // Minute (BPM)': '0-2500',
'Drill Speed (RPM)': '0-250',
'Power at 1160 PSI': '1.8 HP',
'Flow at 1160 PSI': '2.6-13.2 GPM',
'Tool System': 'SDS Max'
}
Result as json: {"Blows // Minute (BPM)":"0-2500","Drill Speed (RPM)":"0-250","Power at 1160 PSI":"1.8 HP","Flow at 1160 PSI":"2.6-13.2 GPM","Tool System":"SDS Max"}
Case 2 (2 or more rows)
If there is a table like below (2 or more info rows):
Description | Model No. | Weight (lbs.) | Blows per Minute | CFM 90 | Noise dB(a) |
---|---|---|---|---|---|
Crawler | 2203 | 7 | 2400 | 4 | 76.3 |
Scraper | 5555 | 9.5 | 3000 | 5.5 | 78.2 |
Then the whole code will look different.
const cheerio = require('cheerio');
const fs = require('fs');

/**
 * Extracts a multi-row HTML table into an object of row objects.
 *
 * @param {Function} cheerio_table_object - The query function returned by
 *   `cheerio.load(html)`. It is called with selectors and with elements, so a
 *   selection object such as `$('table')` will not work here.
 * @param {string[]} [remove_tags=[]] - Selectors whose matches are replaced
 *   with an empty string before any text is read, e.g. `['br']`.
 * @returns {Object<number, Object<string, string>>} One `{ header: value }`
 *   object per row that produced at least one cell, keyed by that row's index
 *   among all `tr` elements (so a leading header row leaves index 0 unused).
 * @throws {TypeError} If `cheerio_table_object` is not callable.
 */
function processCase2table(cheerio_table_object, remove_tags = []) {
  if (typeof cheerio_table_object !== 'function') {
    throw new TypeError(
      'processCase2table expects the function returned by cheerio.load(), not a selection object'
    );
  }

  const $ = cheerio_table_object;
  const columns = [];
  const items = {};

  // Preprocessing: drop unwanted tags (this mutates the loaded document).
  if (remove_tags.length) {
    remove_tags.forEach((tag) => {
      $(tag).replaceWith('');
    });
  }

  // Header cells define the keys of every row object, in column order.
  $('thead tr th').each((columnIndex, th) => {
    columns.push($(th).text());
  });

  $('tr').each((tr_index, tr) => {
    const item = {};

    $('td:not([colspan])', tr).each((cellIndex, td) => {
      // A cell with no matching header would otherwise be stored under the
      // literal key "undefined".
      if (cellIndex >= columns.length) {
        return;
      }
      // Read the cell through the query function that was passed in instead
      // of re-loading each cell into a fresh document via the module global.
      item[columns[cellIndex]] = $(td).text();
    });

    // Rows that yielded no cells (e.g. the header row) are left out.
    if (Object.keys(item).length > 0) {
      items[tr_index] = item;
    }
  });

  return items;
}

// Load the saved table, strip <br> tags, and print the extracted rows.
fs.readFile('table-case-2.html', 'utf8', function (err, data) {
  if (err) throw err;
  const $table_obj = cheerio.load(data);
  // Debug aid: console.log($table_obj.html());
  const res = processCase2table($table_obj, ['br']);
  console.log('Result as an object:\n', res);
  // Optional JSON form: console.log('Result as json:', JSON.stringify(res));
});
Result:
Result as an object: { '1': { Description: 'Crawler', 'Model No.': '2203', '\n Weight(lbs.)\n ': '7', 'Blowsper Minute': '2400', 'CFM 90 ': '4', 'NoisedB(a)': '76.3', }, '2': { Description: 'Scraper', 'Model No.': '5555', '\n Weight(lbs.)\n ': '9.5', 'Blowsper Minute': '3000', 'CFM 90 ': '5.5', 'NoisedB(a)': '78.2', } }
2 replies on “Cheerio.js, get items from html table into object”
It gives me following error:
TypeError: cheerio_table_object is not a function
Then please use the
$( object).children('...')
notation. See the following: