Monday, 7 March 2016

PDFBox: Extract text from PDF using PDFTextStripper

In this post, I am going to explain how to extract text from a PDF file using PDFTextStripper.

The following statements extract the text of a PDF document and return it as a String.

// pdDoc is an already-loaded PDDocument.
PDFTextStripper stripper = new PDFTextStripper();
stripper.setLineSeparator("\n");
stripper.setAddMoreFormatting(true);
// getText returns the extracted text of the document as a String.
String text = stripper.getText(pdDoc);

The PDFTextStripper class provides the following methods to specify the range of pages that you want to extract.

Method
Description
public void setStartPage(int startPageValue)
This will set the first page to be extracted by this class.
public void setEndPage(int endPageValue)
This will set the last page to be extracted by this class.

Note
a.   If you don’t set a start page or an end page, all pages in the PDF document are extracted by default.
b.   If you set only the startPage property, all the data from startPage to the end of the PDF is extracted.
c.   If you set only the endPage property, all the data from the first page of the PDF to endPage is extracted.
d.   The startPage and endPage properties of PDFTextStripper are 1-based and inclusive.


The following is the complete working application.
import java.io.File;
import java.io.IOException;
import java.util.Objects;
import java.util.Optional;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Static helpers that read a PDF file with PDFBox and return its text or its
 * page count.
 *
 * <p>
 * Every method throws {@link NullPointerException} for a null file name. When
 * the file cannot be read, the methods print the exception message to
 * standard output and return {@link Optional#empty()} instead of throwing.
 */
public class PDFTextStripperUtil {

 private static final String NULL_FILE_NAME_MESSAGE = "fileName shouldn't be null";

 /**
  * Extracts the text of every page of the given PDF file.
  *
  * @param fileName
  *            path of the PDF file
  * @return complete file data as string, or an empty Optional if the file
  *         could not be read
  * @throws NullPointerException
  *             if fileName is null
  */
 public static Optional<String> getDataAsString(final String fileName) {
  Objects.requireNonNull(fileName, NULL_FILE_NAME_MESSAGE);

  // 1 and Integer.MAX_VALUE are PDFTextStripper's documented default
  // start and end pages, so this range selects the whole document.
  return extractText(fileName, 1, Integer.MAX_VALUE);
 }

 /**
  * Extracts the text of the pages from startPage to endPage.
  *
  * @param fileName
  *            path of the PDF file
  * @param startPage
  *            first page to extract (1 based, inclusive)
  * @param endPage
  *            last page to extract (1 based, inclusive)
  * @return text of the requested pages, or an empty Optional if the file
  *         could not be read
  * @throws NullPointerException
  *             if fileName is null
  * @throws IllegalArgumentException
  *             if startPage or endPage is less than 1, or endPage is less
  *             than startPage
  */
 public static Optional<String> getDataAsString(final String fileName,
   final int startPage, final int endPage) {
  Objects.requireNonNull(fileName, NULL_FILE_NAME_MESSAGE);

  if (startPage < 1 || endPage < 1 || endPage < startPage) {
   throw new IllegalArgumentException(
     "startPage, endPage must >= 1 and  endPage >= startPage");
  }

  return extractText(fileName, startPage, endPage);
 }

 /**
  * Extracts the text from startPage to the end of the document.
  *
  * <p>
  * The document is loaded only once. A startPage beyond the last page is not
  * rejected here; it is handed to the stripper exactly as the three-argument
  * overload would hand it over.
  *
  * @param fileName
  *            path of the PDF file
  * @param startPage
  *            first page to extract (1 based, inclusive)
  * @return text from startPage onwards, or an empty Optional if the file
  *         could not be read
  * @throws NullPointerException
  *             if fileName is null
  * @throws IllegalArgumentException
  *             if startPage is less than 1
  */
 public static Optional<String> getDataAsStringFromStartPage(
   String fileName, int startPage) {
  Objects.requireNonNull(fileName, NULL_FILE_NAME_MESSAGE);

  if (startPage < 1) {
   throw new IllegalArgumentException("startPage must >= 1");
  }

  // An open-ended range needs no page count: Integer.MAX_VALUE is the
  // stripper's own "until the last page" end value.
  return extractText(fileName, startPage, Integer.MAX_VALUE);
 }

 /**
  * Extracts the text from the first page up to endPage.
  *
  * @param fileName
  *            path of the PDF file
  * @param endPage
  *            last page to extract (1 based, inclusive)
  * @return text of pages 1 to endPage, or an empty Optional if the file
  *         could not be read
  * @throws NullPointerException
  *             if fileName is null
  * @throws IllegalArgumentException
  *             if endPage is less than 1
  */
 public static Optional<String> getDataAsStringTillEndPage(String fileName,
   int endPage) {
  Objects.requireNonNull(fileName, NULL_FILE_NAME_MESSAGE);

  if (endPage < 1) {
   throw new IllegalArgumentException("endPage must >= 1");
  }

  return extractText(fileName, 1, endPage);
 }

 /**
  * Returns the number of pages in the given PDF file.
  *
  * @param fileName
  *            path of the PDF file
  * @return page count, or an empty Optional if the file could not be read
  * @throws NullPointerException
  *             if fileName is null
  */
 public static Optional<Integer> getNumberOfPages(String fileName) {
  Objects.requireNonNull(fileName, NULL_FILE_NAME_MESSAGE);

  try (final PDDocument pdDoc = PDDocument.load(new File(fileName))) {
   return Optional.of(pdDoc.getNumberOfPages());
  } catch (IOException e) {
   System.out.println(e.getMessage());
   return Optional.empty();
  }
 }

 /**
  * Loads the file once and strips the text of the given inclusive page
  * range. Arguments are expected to be validated by the public callers.
  */
 private static Optional<String> extractText(final String fileName,
   final int startPage, final int endPage) {
  try (final PDDocument pdDoc = PDDocument.load(new File(fileName))) {

   PDFTextStripper stripper = new PDFTextStripper();
   stripper.setLineSeparator("\n");
   stripper.setAddMoreFormatting(true);
   stripper.setStartPage(startPage);
   stripper.setEndPage(endPage);

   return Optional.of(stripper.getText(pdDoc));

  } catch (IOException e) {
   // Best-effort contract of this class: report and signal absence.
   System.out.println(e.getMessage());
   return Optional.empty();
  }
 }
}

import java.util.Optional;

/**
 * Small command-line demo that prints the number of pages of a PDF file.
 *
 * <p>
 * The file path is taken from the first command-line argument; when no
 * argument is given, the sample path below is used.
 */
public class PDFTextStripperUtilTest {

 /** Sample path used when no command-line argument is supplied. */
 private static final String DEFAULT_PDF_PATH = "/Users/harikrishna_gurram/Downloads/Saurabh.pdf";

 /**
  * @param args
  *            optional; args[0] is the path of the PDF file to inspect
  */
 public static void main(String[] args) {
  String pdfPath = args.length > 0 ? args[0] : DEFAULT_PDF_PATH;

  Optional<Integer> data = PDFTextStripperUtil.getNumberOfPages(pdfPath);

  if (data.isPresent()) {
   System.out.println("Total Number Of pages : " + data.get());
  } else {
   // An empty Optional means the utility could not read the file.
   System.out.println("Caught with exception while processing");
  }
 }
}





Previous                                                 Next                                                 Home

No comments:

Post a Comment