function strip_tags(str) {
  const tags = ['a', 'em', 'div', 'span', 'p', 'i', 'button', 'img'];
  const tagsAndContent = ['picture', 'script', 'noscript', 'source'];
  // Remove these tags together with everything between them.
  // \b keeps e.g. <i> from matching <img>; [\s\S] also matches across newlines.
  for (const tag of tagsAndContent) {
    const regex = new RegExp('<' + tag + '\\b[\\s\\S]*?</' + tag + '>', 'gi');
    str = str.replace(regex, '');
  }
  // Remove only the opening and closing tags, keeping the inner content.
  for (const tag of tags) {
    const regexOpen = new RegExp('<' + tag + '\\b[^>]*>', 'gi');
    const regexClose = new RegExp('</' + tag + '>', 'gi');
    str = str.replace(regexOpen, '').replace(regexClose, '');
  }
  return str;
}
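A quick usage sketch (the sample markup is invented for illustration):
const html = '<div><p>Hello <em>world</em>!</p><script>alert(1)</script></div>';
console.log(strip_tags(html)); // "Hello world!"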
JavaScript, Regex match groups
Often we want only certain info from the matched content; capture groups help with that.
The following example shows how to fetch the duplicate entry index from the error message. For that we take the first capture group, index 1:
const regex = /Duplicate entry\s'([^']+)'/gm;
const str = `{ Error: (conn=42434, no: 1062, SQLState: 23000) Duplicate entry '135' for key 'PRIMARY'
sql: INSERT INTO \`test\` (id , string1) values (?,?) - parameters:[[135,'string 756']]
at Object.module.exports.createError (C:\\Users\\User\\Documents\\RnD\\Node.js\\mercateo-`;
let m;
while ((m = regex.exec(str)) !== null) {
  // This is necessary to avoid infinite loops with zero-width matches
  if (m.index === regex.lastIndex) {
    regex.lastIndex++;
  }
  // The result can be accessed through the `m` variable.
  m.forEach((match, groupIndex) => {
    console.log(`Found match, group ${groupIndex}: ${match}`);
  });
}
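Run against the error string above, the loop prints the full match as group 0 and the captured entry as group 1:
Found match, group 0: Duplicate entry '135'
Found match, group 1: 135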
As developers scrape data off the web, we use Node.js along with the handy Cheerio scraper. When fetching .html(), the Cheerio parser returns special symbols as HTML-encoded entities, e.g. ä as &auml; and ß as &szlig;.
The Cheerio developers' justification of the parser's behavior (source):
(1) It's not the job of a parser to preserve the original document.
(2) .html() returns an HTML representation of the parsed document, which doesn't have to be equal to the original document.
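If you want the raw characters back, classic Cheerio versions accept a decodeEntities option at load time; a minimal sketch (the sample markup is invented):
const cheerio = require('cheerio');
const html = '<p>Gemäß dem Plan</p>';

// Default behavior: non-ASCII characters come back HTML-encoded.
const $encoded = cheerio.load(html);
console.log($encoded('p').html()); // e.g. "Gem&auml;&szlig; dem Plan" (encoded entities)

// With decodeEntities disabled, the characters are kept as-is.
const $plain = cheerio.load(html, { decodeEntities: false });
console.log($plain('p').html()); // "Gemäß dem Plan"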
Install the mariadb package:
npm i mariadb
The code
const config = require("./config");
const db = config.database;
const mariadb = require('mariadb');
const pool = mariadb.createPool({
  host: db.host,
  user: db.user,
  password: db.password,
  database: db.database,
  connectionLimit: 5
});
async function asyncSaveDataDB(data) {
  let conn;
  try {
    conn = await pool.getConnection();
    const rows = await conn.query("SELECT 1 as val");
    console.log(rows); // [ {val: 1}, meta: ... ]
    const res = await conn.query("INSERT INTO test (string1) values (?)", [data]);
    console.log(res); // { affectedRows: 1, insertId: 1, warningStatus: 0 }
  } catch (err) {
    throw err;
  } finally {
    // Release the connection back to the pool; a `return` here
    // would swallow any error rethrown above.
    if (conn) conn.end();
  }
}
async function asyncSaveDataBulkDB(arr) {
  let conn;
  try {
    conn = await pool.getConnection();
    // Await the batch so the connection isn't released before it completes.
    const res = await conn.batch("INSERT INTO `test` (string1) values (?)", arr);
    console.log(res); // 2
  } catch (err) {
    throw err;
  } finally {
    if (conn) conn.end();
  }
}
if (module.parent) {
  module.exports = { asyncSaveDataDB, asyncSaveDataBulkDB };
} else {
  asyncSaveDataBulkDB(['tt6', 'test 8']);
}
Config.js might look like the following:
module.exports = {
  database: {
    host: "185.221.154.249",
    user: "xxxxxxxxx",
    password: "xxxxxxxxx",
    database: 'xxxxxxxxx'
  }
}
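Assuming the module above is saved as db.js (the file name is assumed here), it can then be required and used from another script:
const { asyncSaveDataDB, asyncSaveDataBulkDB } = require('./db'); // path assumed
asyncSaveDataDB('hello world').catch(console.error);
asyncSaveDataBulkDB(['row 1', 'row 2']).catch(console.error);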
CentOS 7, Node.js, MySQL connect
I have MariaDB installed on my VDS running CentOS 7.
I installed the mysql npm package:
npm i mysql
Yet requiring that package with
var mysql = require('mysql');
did not lead to a successful connection, while
var mysql = require('mariadb');
did.
var mysql = require('mariadb');
var con = mysql.createConnection({
  host: "localhost",
  user: "admin_default",
  password: "xxxxxx",
  database: 'admin_default'
}).then(function () {
  console.log('connected!');
}, function (err) {
  console.log(err);
});
//console.log(con);
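The same check can be written with async/await and an explicit close; a minimal sketch using the mariadb promise API:
const mariadb = require('mariadb');

async function testConnection() {
  let conn;
  try {
    conn = await mariadb.createConnection({
      host: "localhost",
      user: "admin_default",
      password: "xxxxxx",
      database: 'admin_default'
    });
    console.log('connected!');
  } catch (err) {
    console.log(err);
  } finally {
    // Close the connection whether or not the connect succeeded.
    if (conn) await conn.end();
  }
}

testConnection();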
Given: a webpage to scrape.
If you inspect the DOM tree of that page, you will find that quite a few tags contain the keyword dist. For example:
<link rel="shortcut icon" type="image/x-icon" href="/wcsstore/ColesResponsiveStorefrontAssetStore/dist/30e70cfc76bf73d384beffa80ba6cbee/img/favicon.ico">
<link rel="stylesheet" href="/wcsstore/ColesResponsiveStorefrontAssetStore/dist/30e70cfc76bf73d384beffa80ba6cbee/css/google/fonts-Source-Sans-Pro.css" type="text/css" media="screen">
In this post we share with you how to perform web scraping of a JS-rendered website. The tools, as seen in the header, are Java with the Selenium library driving headless Chrome instances (download the driver) and Jsoup as a parser to extract data from the acquired HTML.
Try the code in the scripts below.
Check if cookies are enabled
function areCookiesEnabled() {
  var cookieEnabled = Boolean(navigator.cookieEnabled);
  // Some older browsers don't expose navigator.cookieEnabled,
  // so probe by actually setting a test cookie.
  if (typeof navigator.cookieEnabled === "undefined" && !cookieEnabled) {
    document.cookie = "test";
    cookieEnabled = document.cookie.indexOf("test") !== -1;
  }
  return cookieEnabled;
}
Navigator is the interface that represents the state and identity of the user agent. It allows scripts to query it and to register themselves to carry on some activities. A Navigator object can be retrieved using the read-only window.navigator property.
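For instance, a script can query a couple of its read-only properties (values vary per browser):
console.log(window.navigator.cookieEnabled); // e.g. true
console.log(window.navigator.userAgent);     // the browser's identification string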
Check if sessionStorage is enabled
function isStorageEnabled() {
  try {
    // Accessing sessionStorage can throw (e.g. in private browsing
    // or when storage is blocked), hence the try/catch.
    sessionStorage.setItem("test", "value");
    if (sessionStorage.getItem("test") === "value") {
      sessionStorage.removeItem("test");
      return true;
    } else {
      return false;
    }
  } catch (err) {
    return false;
  }
}
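Both helpers can then guard code that relies on client-side state; a usage sketch (the fallback message is illustrative):
if (!areCookiesEnabled() || !isStorageEnabled()) {
  console.warn('Cookies or sessionStorage unavailable; skipping client-side persistence.');
}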