How do you handle cookies, the user-agent, and headers when scraping with Java? We'll use a static utility class, ScrapeHelper,
that handles all of this. The class uses the Jsoup library to fetch data from the server and parse the HTML into a DOM document.
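The core of that approach is a two-request pattern: one request whose only purpose is to collect the cookies the server sets, and a second request that sends those cookies back along with a browser-like user-agent and any extra headers. Here is a minimal, self-contained sketch of just that pattern (the class and method names are only for illustration); the full helper class below wraps it with retries and selector utilities.

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.util.Map;

public class CookiePrimingSketch {
    static Document fetchWithCookies(String url, String userAgent,
                                     Map<String, String> headers) throws IOException {
        // first request: executed only to capture the cookies the server sets
        Connection.Response initial = Jsoup.connect(url)
                .userAgent(userAgent)
                .execute();

        // second request: replay the cookies, add the user-agent and headers
        return Jsoup.connect(url)
                .userAgent(userAgent)
                .headers(headers)
                .cookies(initial.cookies())
                .get();
    }
}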
The ScrapeHelper class
package com;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class ScrapeHelper {

    // Text of the first element matching the selector, or "" if there is no match.
    public static String getText(Element document, String selector) {
        Elements elements = document.select(selector);
        if (elements.size() > 0)
            return elements.get(0).text().trim();
        else
            return "";
    }

    // Text of every element matching the selector, with empty strings filtered out.
    public static List<String> getTexts(Element document, String selector) {
        Elements elements = document.select(selector);
        if (elements.size() > 0)
            return elements.eachText().stream()
                    .filter(str -> !str.isEmpty())
                    .collect(Collectors.toList());
        else
            return new ArrayList<>();
    }

    // Attribute value of the first matching element, or "" if there is no match.
    public static String getAttribute(Element document, String selector, String attribute) {
        Elements elements = document.select(selector);
        if (elements.size() > 0)
            return elements.get(0).attr(attribute).trim();
        else
            return "";
    }

    // Attribute values of every matching element.
    public static List<String> getAttributes(Element document, String selector, String attribute) {
        Elements elements = document.select(selector);
        if (elements.size() > 0)
            return elements.eachAttr(attribute);
        else
            return new ArrayList<>();
    }

    // Inner HTML of the first matching element, or "" if there is no match.
    public static String getInnerHtml(Element document, String selector) {
        Elements elements = document.select(selector);
        if (elements.size() > 0)
            return elements.get(0).html();
        else
            return "";
    }

    // Inner HTML of every matching element.
    public static List<String> getInnerHtmls(Element document, String selector) {
        Elements elements = document.select(selector);
        if (elements.size() > 0) {
            List<String> result = new ArrayList<>();
            for (Element element : elements)
                result.add(element.html());
            return result;
        } else
            return new ArrayList<>();
    }

    // Fetches the page with the given user-agent and headers, replaying the cookies
    // collected by an initial request. Retries up to maxRetries times on failure.
    public static Document goToPageAndGetDocument(String link, String userAgent,
                                                  Map<String, String> headers, int maxRetries) {
        int failCounter = 0;
        if (maxRetries <= 0)
            maxRetries = 1;
        while (failCounter < maxRetries) {
            try {
                // first request to get initial cookies
                Connection.Response response = Jsoup.connect(link)
                        .userAgent(userAgent)
                        .execute();
                // main request for data
                return Jsoup.connect(link)
                        .userAgent(userAgent)
                        //.header("Accept-Language", "en-US")
                        .headers(headers)
                        .cookies(response.cookies())
                        .get();
            } catch (Exception e) {
                e.printStackTrace();
                failCounter++;
            }
        }
        // return an empty document, an instance of the type Document.
        return Jsoup.parse("<!DOCTYPE html><html><head><title></title></head><body></body></html>");
    }

    // Turns a relative link into an absolute one by prefixing the base URL.
    public static String joinBaseUrlAndLink(String baseUrl, String link) {
        if (!link.startsWith("http") && !link.isEmpty()) {
            if (!link.startsWith("/") && !baseUrl.endsWith("/"))
                link = baseUrl + "/" + link;
            else
                link = baseUrl + link;
        }
        return link;
    }

    private ScrapeHelper() {}
}
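The selector helpers accept any Element, so they work just as well on a hard-coded HTML string as on a fetched page. The following small sketch exercises getText, getAttribute, and joinBaseUrlAndLink offline; the class name, markup, and selectors are invented purely for illustration.

import com.ScrapeHelper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class HelperDemo {
    public static void main(String[] args) {
        // Hypothetical markup, just to exercise the selector helpers without a network call.
        Document doc = Jsoup.parse(
                "<div class='product'>"
              + "  <h2 class='title'>Scraping with Java</h2>"
              + "  <a class='buy' href='/dp/B08DJ8VHL2/'>Buy</a>"
              + "</div>");

        String title = ScrapeHelper.getText(doc, "h2.title");              // "Scraping with Java"
        String relative = ScrapeHelper.getAttribute(doc, "a.buy", "href"); // "/dp/B08DJ8VHL2/"

        // Relative links can be turned into absolute ones before following them.
        String absolute = ScrapeHelper.joinBaseUrlAndLink("https://www.amazon.com", relative);
        System.out.println(title + " -> " + absolute);
    }
}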
Using the ScrapeHelper class
import com.ScrapeHelper;
import org.jsoup.nodes.Document;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class Main {
    public static void main(String[] args) {
        String link = "https://www.amazon.com/Scraping-Javascript-dependent-website-Puppeteer-asynchronous-ebook/dp/B08DJ8VHL2/";

        // user-agent
        String userAgent = "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36";
        int maxRetries = 5;

        // Headers
        Map<String, String> headers = new HashMap<>();
        headers.put("Accept-Language", "en-US");
        headers.put("Accept-Encoding", "gzip,deflate,sdch");

        // main call, cookies are fetched inside the method
        Document doc = ScrapeHelper.goToPageAndGetDocument(link, userAgent, headers, maxRetries);
        String html = doc.html();
        List<String> productURLs = ScrapeHelper.getAttributes(doc, "a.major-links", "href");
    }
}
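The href values scraped from a listing page are often relative, so a typical follow-up step is to absolutize them with joinBaseUrlAndLink and fetch each product page with the same helper. Continuing inside main() from the snippet above, a sketch of that step might look like this (the span#productTitle selector is a placeholder assumption, not something the target site guarantees):

        // Follow-up sketch: resolve relative hrefs and visit each product page.
        for (String productUrl : productURLs) {
            String absolute = ScrapeHelper.joinBaseUrlAndLink("https://www.amazon.com", productUrl);
            Document productDoc = ScrapeHelper.goToPageAndGetDocument(absolute, userAgent, headers, maxRetries);
            String title = ScrapeHelper.getText(productDoc, "span#productTitle"); // hypothetical selector
            System.out.println(title + " -> " + absolute);
        }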