Read PDF content on a browser using Selenium webdriver

Question

How can I read PDF content in a browser using Selenium WebDriver?

Abha · Answer 1 · Jul 5, 2019

Hey Parvati, you can use the Apache PDFBox JAR files to read PDF content in a browser using Selenium WebDriver. You can download the Apache PDFBox JAR from the Apache PDFBox download page. Then simply add the Selenium Standalone JAR and the PDFBox JAR to the build path of your Java project. Now you can use the following code snippet to read PDF data from a web page:

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.net.URL;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.testng.Assert;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;

/**
 * TestNG example that opens a PDF URL in Chrome through Selenium WebDriver and
 * verifies the document's text using Apache PDFBox.
 *
 * <p>The text is not scraped from the browser's PDF viewer: the test asks the
 * browser for its current URL and then downloads and parses that URL directly.
 */
public class PDFReader {

    WebDriver driver;

    /**
     * Tells Selenium where the chromedriver binary lives and starts a Chrome session.
     */
    @BeforeTest
    public void setUp() {
        // Machine-specific location; change it to wherever chromedriver is installed.
        System.setProperty("webdriver.chrome.driver", "C:\\Users\\Abha_Rathour\\Downloads\\chromedriver.exe");

        driver = new ChromeDriver();
    }

    /**
     * Navigates to a sample PDF and asserts that its extracted text contains the
     * expected sentence.
     *
     * @throws Exception if the PDF cannot be downloaded or parsed
     */
    @Test
    public void verifyPDFContent() throws Exception {
        String url = "http://www.africau.edu/images/default/sample.pdf";
        try {
            driver.get(url);
            String pdfContent = readPDFContent(driver.getCurrentUrl());

            Assert.assertTrue(pdfContent.contains("This is a small demonstration .pdf file"));
        } finally {
            // Quit even when navigation or the assertion fails, so no browser or
            // chromedriver process is left running after the test.
            driver.quit();
        }
    }

    /**
     * Downloads the PDF at the given URL and returns all of its text. The text is
     * also printed to standard output.
     *
     * @param appUrl absolute URL of the PDF document
     * @return the text extracted from the document by {@code PDFTextStripper}
     * @throws Exception if the URL is malformed, the download fails, or the
     *         content cannot be loaded as a PDF
     */
    public String readPDFContent(String appUrl) throws Exception {
        URL url = new URL(appUrl);

        // try-with-resources closes the document first, then both streams, even
        // when loading, text extraction, or an earlier close() throws.
        try (InputStream input = url.openStream();
             BufferedInputStream fileToParse = new BufferedInputStream(input);
             PDDocument document = PDDocument.load(fileToParse)) {
            String output = new PDFTextStripper().getText(document);
            System.out.println(output);
            return output;
        }
    }
}