
Simple Java scraper that handles user-agent, headers and cookies

How do you handle cookies, the user-agent and request headers when scraping with Java? We'll use a small static helper class, ScrapeHelper, that takes care of all of this. The class uses the Jsoup library to fetch data from the server and parse the HTML into a DOM document.
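The only external dependency is Jsoup, available on Maven Central as org.jsoup:jsoup. If you build with Maven, a dependency entry like the following works (the version shown is only an example; check Maven Central for the latest release):

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>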

The ScrapeHelper class

package com;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class ScrapeHelper {

    public static String getText(Element document, String selector) {
        Elements elements = document.select(selector);
        if(elements.size() > 0)
            return elements.get(0).text().trim();
        else
            return "";
    }

    public static List<String> getTexts(Element document, String selector) {
        Elements elements = document.select(selector);
        if(elements.size() > 0)
            return elements.eachText().stream().filter(str -> !str.isEmpty()).collect(Collectors.toList());
        else
            return new ArrayList<>();
    }

    public static String getAttribute(Element document, String selector, String attribute) {
        Elements elements = document.select(selector);
        if(elements.size() > 0)
            return elements.get(0).attr(attribute).trim();
        else
            return "";
    }

    public static List<String> getAttributes(Element document, String selector, String attribute) {
        Elements elements = document.select(selector);
        if(elements.size() > 0)
            return elements.eachAttr(attribute);
        else
            return new ArrayList<>();
    }

    public static String getInnerHtml(Element document, String selector) {
        Elements elements = document.select(selector);
        if(elements.size() > 0)
            return elements.get(0).html();
        else
            return "";
    }

    public static List<String> getInnerHtmls(Element document, String selector) {
        Elements elements = document.select(selector);
        if(elements.size() > 0) {
            List<String> result = new ArrayList<>();
            for(Element element : elements)
                result.add(element.html());
            return result;
        }
        else
            return new ArrayList<>();
    }

    public static Document goToPageAndGetDocument(String link, String userAgent, Map<String, String> headers, int maxRetries) {
        int failCounter = 0;
        if(maxRetries <= 0)
            maxRetries = 1;
        while (failCounter < maxRetries) {
            try {
                // first request to get the initial cookies
                Connection.Response response = Jsoup.connect(link)
                        .userAgent(userAgent)
                        .execute();

                // main request for the data; individual headers could also
                // be set one by one with .header(name, value)
                return Jsoup.connect(link)
                        .userAgent(userAgent)
                        .headers(headers)
                        .cookies(response.cookies())
                        .get();
            }
            catch (Exception e) {
                e.printStackTrace();
                failCounter++;
            }
        }
        // all retries failed: return an empty Document instead of null
        return Jsoup.parse("<!DOCTYPE html><html><head><title></title></head><body></body></html>");
    }
  
    public static String joinBaseUrlAndLink(String baseUrl, String link) {
        if(!link.startsWith("http") && !link.isEmpty()) {
            if(!link.startsWith("/") && !baseUrl.endsWith("/"))
                link = baseUrl + "/" + link;
            else
                link = baseUrl + link;
        }
        return link;
    }

    private ScrapeHelper() {} // static utility class, no instances
}
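When every retry fails, goToPageAndGetDocument prints the stack trace and returns an empty parsed document rather than null, so callers never have to null-check. If you want to detect that fallback, a minimal check (assuming the page you scrape always renders a non-empty body) could be:

Document doc = ScrapeHelper.goToPageAndGetDocument(url, userAgent, headers, maxRetries);
if (doc.body().children().isEmpty())
    System.err.println("All " + maxRetries + " attempts failed for " + url);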

Using the ScrapeHelper class

import com.ScrapeHelper;

import org.jsoup.nodes.Document;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class Main {
    public static void main(String[] args) {
        String url = "https://www.amazon.com/Scraping-Javascript-dependent-website-Puppeteer-asynchronous-ebook/dp/B08DJ8VHL2/";

        // user-agent
        String userAgent = "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36";
        int maxRetries = 5;

        // headers
        Map<String, String> headers = new HashMap<>();
        headers.put("Accept-Language", "en-US");
        headers.put("Accept-Encoding", "gzip,deflate,sdch");

        // main call; cookies are fetched inside the method
        Document doc = ScrapeHelper.goToPageAndGetDocument(url, userAgent, headers, maxRetries);
        String html = doc.html();
        List<String> productURLs = ScrapeHelper.getAttributes(doc, "a.major-links", "href");
    }
}
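The href values collected this way are often relative paths, so they can be turned into absolute URLs with the joinBaseUrlAndLink helper from the class above. A minimal sketch, continuing inside main (the base URL here is just an illustration, and it also needs import java.util.ArrayList; at the top of Main):

List<String> absoluteURLs = new ArrayList<>();
for (String productURL : productURLs)
    absoluteURLs.add(ScrapeHelper.joinBaseUrlAndLink("https://www.amazon.com", productURL));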
