Monday, 7 March 2016

Apache PDFBox: Utility class to work with PDF documents


The following utility class provides a number of methods to work with PDF files. I used the following Maven dependencies.
<dependencies>
 <dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox</artifactId>
  <version>2.0.0-RC3</version>
 </dependency>

 <dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox-lucene</artifactId>
  <version>1.8.11</version>
 </dependency>

 <dependency>
  <groupId>org.bouncycastle</groupId>
  <artifactId>bcprov-jdk16</artifactId>
  <version>1.46</version>
 </dependency>

 <dependency>
  <groupId>commons-io</groupId>
  <artifactId>commons-io</artifactId>
  <version>2.4</version>
 </dependency>

 <dependency>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>preflight</artifactId>
  <version>2.0.0-RC3</version>
 </dependency>

</dependencies>

The following is a brief summary of the methods defined in the utility class.
Method
Description
public static Optional<String> getDataAsString(final String fileName)
Return pdf file data as a string.
public static Optional<String> getDataAsString(final String fileName,final int startPage, final int endPage)
Return pdf file data as a string from given start page to end page.
public static Optional<String> getDataAsStringFromStartPage(String fileName,int startPage)
Return pdf file data as a string from given page to the end.
public static Optional<String> getDataAsStringTillEndPage(String fileName,int endPage)
Return pdf file data as a string from the first page up to and including the given end page.
public static Optional<Integer> getNumberOfPages(String fileName)
Get number of pages.
public static Optional<Map<String, Object>> getDocumentBasicMetaData(final String fileName)
Get basic metadata of PDF document.
public static Optional<List<String>> getCatalogMetaData(final String fileName)
Get catalog meta data.
private static Optional<List<String>> getDataFromStream(InputStream in)
Read data from input stream.
private static Optional<List<String>> getMeatData(PDMetadata metadata)
Get meta data from PDMetadata.
public static Optional<List<String>> getPDPageMetaData(final String fileName, int pageIndex)
Get page metadata.
public static boolean encryptDocument(final String fileName,final String encryptedFileName, final int keyLength,final String ownerPassword, final String userPassword)
Encrypt PDF document.
public static boolean encryptDocument(final String fileName,final String encryptedFileName, final String ownerPassword,final String userPassword)
Encrypt PDF document. Uses keylength 128
public static boolean encryptDocument(final String fileName,final String ownerPassword, final String userPassword)
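Encrypt PDF document. Uses key length 128; the result is stored in a file named encrypted_ followed by the original file name (for example encrypted_sample.pdf), relative to the current working directory.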
public static boolean addAtachement(final String fileName,final String... attachements)
Add attachments to PDF file.
public static Optional<Set<String>> getAttachements(final String fileName)
Return all attachments.
public static void extractAttachements(final String fileName,final String destDirectory)
Extract attachments to destination directory.
public static Optional<ValidationResult> getValidationResult(String fileName)
Validate PDF Document.
public static boolean isValidPDF(String fileName)
Return true, if PDF is valid PDF/A-1b file, else false.
private static void copyFileToDirectory(final String srcFile,final String destDir)
Copy file to destination directory.


import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardProtectionPolicy;
import org.apache.pdfbox.preflight.PreflightDocument;
import org.apache.pdfbox.preflight.ValidationResult;
import org.apache.pdfbox.preflight.parser.PreflightParser;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Utility class to work with PDF files (text extraction, metadata,
 * encryption, attachments and PDF/A-1b validation) on top of Apache PDFBox.
 * <p>
 * All methods are static. I/O failures are reported by printing the exception
 * message to standard output and returning an empty {@link Optional} or
 * {@code false} (void methods simply return); invalid arguments raise
 * unchecked exceptions.
 *
 * @author harikrishna_gurram
 *
 */
public class PDFTextStripperUtil {

    /** Message used by every method that rejects a null file name. */
    private static final String FILE_NAME_NULL = "fileName shouldn't be null";

    /** Key length used by the encryptDocument overloads that do not take one. */
    private static final int DEFAULT_KEY_LENGTH = 128;

    /**
     * Extracts the text of the whole document.
     *
     * @param fileName
     *            path of the PDF file
     * @return complete file data as string, or empty if the file cannot be
     *         read
     * @throws NullPointerException
     *             if fileName is null
     */
    public static Optional<String> getDataAsString(final String fileName) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        try (PDDocument pdDoc = PDDocument.load(new File(fileName))) {
            return Optional.of(newStripper().getText(pdDoc));
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Extracts the text of the pages startPage..endPage (both 1-based and
     * inclusive).
     *
     * @param fileName
     *            path of the PDF file
     * @param startPage
     *            first page to extract, starting at 1
     * @param endPage
     *            last page to extract, must not be smaller than startPage
     * @return text of the requested pages, or empty if the file cannot be
     *         read
     * @throws NullPointerException
     *             if fileName is null
     * @throws IllegalArgumentException
     *             if a page number is below 1 or endPage is before startPage
     */
    public static Optional<String> getDataAsString(final String fileName,
            final int startPage, final int endPage) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        if (startPage < 1 || endPage < 1 || endPage < startPage) {
            throw new IllegalArgumentException(
                    "startPage, endPage must >= 1 and  endPage >= startPage");
        }

        try (PDDocument pdDoc = PDDocument.load(new File(fileName))) {
            return Optional.of(extractText(pdDoc, startPage, endPage));
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Extracts the text from startPage (1-based) up to the last page of the
     * document.
     *
     * @param fileName
     *            path of the PDF file
     * @param startPage
     *            first page to extract, starting at 1
     * @return text of the requested pages, or empty if the file cannot be
     *         read
     * @throws NullPointerException
     *             if fileName is null
     * @throws IllegalArgumentException
     *             if startPage is below 1 or beyond the last page
     */
    public static Optional<String> getDataAsStringFromStartPage(
            String fileName, int startPage) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        if (startPage < 1) {
            throw new IllegalArgumentException("startPage must >= 1");
        }

        /*
         * The document is loaded once only; the page count and the text are
         * both taken from the same open document.
         */
        try (PDDocument pdDoc = PDDocument.load(new File(fileName))) {
            final int noOfPages = pdDoc.getNumberOfPages();
            if (startPage > noOfPages) {
                throw new IllegalArgumentException("startPage : " + startPage
                        + " must <= " + noOfPages);
            }
            return Optional.of(extractText(pdDoc, startPage, noOfPages));
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Extracts the text from the first page up to and including endPage
     * (1-based).
     *
     * @param fileName
     *            path of the PDF file
     * @param endPage
     *            last page to extract, starting at 1
     * @return text of the requested pages, or empty if the file cannot be
     *         read
     * @throws NullPointerException
     *             if fileName is null
     * @throws IllegalArgumentException
     *             if endPage is below 1
     */
    public static Optional<String> getDataAsStringTillEndPage(String fileName,
            int endPage) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        if (endPage < 1) {
            throw new IllegalArgumentException("endPage must >= 1");
        }

        return getDataAsString(fileName, 1, endPage);
    }

    /**
     * Creates a text stripper configured the way every text extraction
     * method of this class uses it.
     */
    private static PDFTextStripper newStripper() throws IOException {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setLineSeparator("\n");
        stripper.setAddMoreFormatting(true);
        return stripper;
    }

    /** Extracts the text of pages startPage..endPage from an open document. */
    private static String extractText(PDDocument pdDoc, int startPage,
            int endPage) throws IOException {
        PDFTextStripper stripper = newStripper();
        stripper.setStartPage(startPage);
        stripper.setEndPage(endPage);
        return stripper.getText(pdDoc);
    }

    /**
     * Returns the number of pages of the document.
     *
     * @param fileName
     *            path of the PDF file
     * @return page count, or empty if the file cannot be read
     * @throws NullPointerException
     *             if fileName is null
     */
    public static Optional<Integer> getNumberOfPages(String fileName) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        try (PDDocument pdDoc = PDDocument.load(new File(fileName))) {
            return Optional.of(pdDoc.getNumberOfPages());
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Returns the entries of the document information dictionary, keyed by
     * metadata key.
     *
     * @param fileName
     *            path of the PDF file
     * @return map of metadata key to value, or empty if the file cannot be
     *         read
     * @throws NullPointerException
     *             if fileName is null
     */
    public static Optional<Map<String, Object>> getDocumentBasicMetaData(
            final String fileName) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        try (PDDocument pdDoc = PDDocument.load(new File(fileName))) {
            PDDocumentInformation docInfo = pdDoc.getDocumentInformation();

            Map<String, Object> map = new HashMap<>();
            for (String key : docInfo.getMetadataKeys()) {
                map.put(key, docInfo.getPropertyStringValue(key));
            }
            return Optional.of(map);
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Returns the metadata stream attached to the document catalog, one list
     * element per line.
     *
     * @param fileName
     *            path of the PDF file
     * @return metadata lines, or empty if there is no metadata or the file
     *         cannot be read
     * @throws NullPointerException
     *             if fileName is null
     */
    public static Optional<List<String>> getCatalogMetaData(
            final String fileName) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        try (PDDocument pdDoc = PDDocument.load(new File(fileName))) {
            PDDocumentCatalog catalog = pdDoc.getDocumentCatalog();
            return getMeatData(catalog.getMetadata());
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Reads the stream line by line. The stream is closed when this method
     * returns.
     *
     * @param in
     *            stream to read
     * @return the lines read, or empty if reading fails
     */
    private static Optional<List<String>> getDataFromStream(InputStream in) {
        /*
         * Decode as UTF-8 instead of the platform default charset so the
         * result does not depend on the machine the code runs on.
         */
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(in, StandardCharsets.UTF_8))) {
            List<String> data = new ArrayList<>();
            String line;
            while ((line = br.readLine()) != null) {
                data.add(line);
            }
            return Optional.of(data);
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Reads the content of a metadata stream, one list element per line.
     *
     * @param metadata
     *            metadata stream, may be null
     * @return metadata lines, or empty if metadata is null or cannot be read
     */
    private static Optional<List<String>> getMeatData(PDMetadata metadata) {
        if (metadata == null) {
            System.out.println("There is no meta data associated");
            return Optional.empty();
        }

        try (InputStream in = metadata.createInputStream()) {
            return getDataFromStream(in);
        } catch (IOException e) {
            /* report the failure like every other method of this class */
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Returns the metadata stream attached to a single page, one list
     * element per line.
     *
     * @param fileName
     *            path of the PDF file
     * @param pageIndex
     *            page number, starting at 1
     * @return metadata lines, or empty if the page has no metadata or the
     *         file cannot be read
     * @throws NullPointerException
     *             if fileName is null
     * @throws IllegalArgumentException
     *             if pageIndex is below 1 or beyond the last page
     */
    public static Optional<List<String>> getPDPageMetaData(
            final String fileName, int pageIndex) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        if (pageIndex < 1) {
            throw new IllegalArgumentException("pageIndex must >= 1");
        }

        try (PDDocument pdDoc = PDDocument.load(new File(fileName))) {
            final int noOfPages = pdDoc.getNumberOfPages();
            if (pageIndex > noOfPages) {
                throw new IllegalArgumentException("pageIndex : " + pageIndex
                        + " must <= " + noOfPages);
            }

            /*
             * pageIndex is validated as 1-based, PDDocument.getPage expects a
             * 0-based index.
             */
            PDPage pdPage = pdDoc.getPage(pageIndex - 1);
            return getMeatData(pdPage.getMetadata());
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Encrypts a PDF document and saves the result under encryptedFileName.
     * Printing is disabled in the encrypted document, every other permission
     * keeps its default.
     *
     * @param fileName
     *            path of the PDF file to encrypt
     * @param encryptedFileName
     *            path the encrypted document is saved to
     * @param keyLength
     *            length of the encryption key; possible values are 40, 128
     *            and 256
     * @param ownerPassword
     *            owner password
     * @param userPassword
     *            user password
     * @return true if the encrypted document was saved, false on I/O failure
     * @throws NullPointerException
     *             if any String argument is null
     * @throws IllegalArgumentException
     *             if keyLength is below 1
     */
    public static boolean encryptDocument(final String fileName,
            final String encryptedFileName, final int keyLength,
            final String ownerPassword, final String userPassword) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);
        Objects.requireNonNull(encryptedFileName,
                "encryptedFileName shouldn't be null");

        if (keyLength < 1) {
            throw new IllegalArgumentException("keyLength should > 0");
        }

        Objects.requireNonNull(ownerPassword,
                "ownerPassword shouldn't be null");
        Objects.requireNonNull(userPassword, "userPassword shouldn't be null");

        try (PDDocument doc = PDDocument.load(new File(fileName))) {
            AccessPermission ap = new AccessPermission();

            /* disable printing, everything else is allowed */
            ap.setCanPrint(false);

            /* the policy already carries the permissions passed here */
            StandardProtectionPolicy spp = new StandardProtectionPolicy(
                    ownerPassword, userPassword, ap);
            spp.setEncryptionKeyLength(keyLength);
            doc.protect(spp);

            doc.save(encryptedFileName);
            return true;
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return false;
        }
    }

    /**
     * Encrypts a PDF document with a key length of 128 and saves the result
     * under encryptedFileName.
     *
     * @param fileName
     *            path of the PDF file to encrypt
     * @param encryptedFileName
     *            path the encrypted document is saved to
     * @param ownerPassword
     *            owner password
     * @param userPassword
     *            user password
     * @return true if the encrypted document was saved, false on I/O failure
     * @throws NullPointerException
     *             if any argument is null
     */
    public static boolean encryptDocument(final String fileName,
            final String encryptedFileName, final String ownerPassword,
            final String userPassword) {
        return encryptDocument(fileName, encryptedFileName,
                DEFAULT_KEY_LENGTH, ownerPassword, userPassword);
    }

    /**
     * Encrypts a PDF document with a key length of 128. The result is saved
     * under the name "encrypted_" followed by the file name (without its
     * directory) of the input.
     *
     * @param fileName
     *            path of the PDF file to encrypt
     * @param ownerPassword
     *            owner password
     * @param userPassword
     *            user password
     * @return true if the encrypted document was saved, false on I/O failure
     * @throws NullPointerException
     *             if any argument is null
     */
    public static boolean encryptDocument(final String fileName,
            final String ownerPassword, final String userPassword) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        String encryptedFileName = "encrypted_" + new File(fileName).getName();
        return encryptDocument(fileName, encryptedFileName,
                DEFAULT_KEY_LENGTH, ownerPassword, userPassword);
    }

    /**
     * Embeds the given files into the PDF document and saves the document
     * back to fileName. Attachments already present are kept; each new
     * attachment is registered under the path it was given as.
     *
     * @param fileName
     *            path of the PDF file
     * @param attachements
     *            paths of the files to embed
     * @return true if the document was saved, false on I/O failure (nothing
     *         is saved in that case)
     * @throws NullPointerException
     *             if fileName or attachements is null
     */
    public static boolean addAtachement(final String fileName,
            final String... attachements) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);
        Objects.requireNonNull(attachements, "attachements shouldn't be null");

        Map<String, PDComplexFileSpecification> efMap = new HashMap<>();

        try (PDDocument doc = PDDocument.load(new File(fileName))) {

            /*
             * Attachments are stored as part of the "names" dictionary in the
             * document catalog
             */
            PDDocumentNameDictionary names = new PDDocumentNameDictionary(
                    doc.getDocumentCatalog());

            PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
            if (Objects.isNull(efTree)) {
                efTree = new PDEmbeddedFilesNameTreeNode();
            }

            /*
             * getNames() yields null when the node holds no names of its own,
             * which is always the case for the freshly created tree above.
             * NOTE(review): names stored in child nodes of an existing tree
             * are not merged here - confirm whether that case matters.
             */
            Map<String, PDComplexFileSpecification> existedNames = efTree
                    .getNames();
            if (existedNames != null) {
                efMap.putAll(existedNames);
            }

            for (String attachement : attachements) {
                /* Create the file specification, which holds the embedded file */
                PDComplexFileSpecification fs = new PDComplexFileSpecification();
                fs.setFile(attachement);

                try (InputStream is = new FileInputStream(attachement)) {
                    /* This represents an embedded file in a file specification */
                    PDEmbeddedFile ef = new PDEmbeddedFile(doc, is);

                    /* Set some relevant properties of embedded file */
                    ef.setCreationDate(new GregorianCalendar());
                    fs.setEmbeddedFile(ef);

                    /* register the entry for the embedded file tree */
                    efMap.put(attachement, fs);
                }
            }

            efTree.setNames(efMap);
            names.setEmbeddedFiles(efTree);
            doc.getDocumentCatalog().setNames(names);

            /*
             * NOTE(review): this saves over the file the document was loaded
             * from while it is still open - confirm this is safe on the
             * target platforms.
             */
            doc.save(fileName);
            return true;
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return false;
        }
    }

    /**
     * Reads the embedded file specifications registered directly on the
     * embedded files name tree of an open document.
     *
     * @return name to file specification, or empty if the document has no
     *         embedded files tree or the tree node holds no names of its own
     */
    private static Optional<Map<String, PDComplexFileSpecification>> readEmbeddedFileSpecs(
            PDDocument doc) throws IOException {
        /*
         * Attachments are stored as part of the "names" dictionary in the
         * document catalog
         */
        PDDocumentNameDictionary names = new PDDocumentNameDictionary(
                doc.getDocumentCatalog());

        PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
        if (Objects.isNull(efTree)) {
            return Optional.empty();
        }

        /* getNames() may be null, e.g. for a node without names of its own */
        return Optional.ofNullable(efTree.getNames());
    }

    /**
     * Returns the names of all attachments registered in the document.
     *
     * @param fileName
     *            path of the PDF file
     * @return attachment names, or empty if the document has no attachments
     *         or cannot be read
     * @throws NullPointerException
     *             if fileName is null
     */
    public static Optional<Set<String>> getAttachements(final String fileName) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        try (PDDocument doc = PDDocument.load(new File(fileName))) {
            return readEmbeddedFileSpecs(doc).map(Map::keySet);
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Copies a file from the local file system into a directory. Failures
     * are printed, not thrown.
     *
     * @param srcFile
     *            path of the file to copy
     * @param destDir
     *            directory to copy into
     */
    private static void copyFileToDirectory(final String srcFile,
            final String destDir) {
        try {
            FileUtils.copyFileToDirectory(new File(srcFile), new File(destDir));
            System.out.println("Copied file " + srcFile
                    + " to the directory : " + destDir);
        } catch (IOException e) {
            System.out.println("File copying failed. src : " + srcFile
                    + " Destination directory : " + destDir);
            System.out.println(e.getMessage());
        }
    }

    /** Returns the part of path after the last '/' or '\' separator. */
    private static String baseName(String path) {
        int lastSeparator = Math.max(path.lastIndexOf('/'),
                path.lastIndexOf('\\'));
        return path.substring(lastSeparator + 1);
    }

    /**
     * Writes one attachment into destDirectory. Failures are printed, not
     * thrown, so one broken attachment does not stop the others.
     */
    private static void extractAttachement(String name,
            PDComplexFileSpecification spec, String destDirectory) {
        PDEmbeddedFile embedded = spec.getEmbeddedFile();
        if (Objects.isNull(embedded)) {
            /*
             * Nothing is embedded for this entry; fall back to copying the
             * file the name refers to on the local file system.
             */
            copyFileToDirectory(name, destDirectory);
            return;
        }

        /*
         * Attachment names may be full paths; only the base name is used so
         * the file always ends up inside destDirectory.
         */
        String targetName = baseName(name);
        if (targetName.isEmpty() || ".".equals(targetName)
                || "..".equals(targetName)) {
            System.out.println("Skipping attachement with unusable name : "
                    + name);
            return;
        }

        try (InputStream in = embedded.createInputStream()) {
            FileUtils.copyInputStreamToFile(in, new File(destDirectory,
                    targetName));
            System.out.println("Extracted attachement " + name
                    + " to the directory : " + destDirectory);
        } catch (IOException e) {
            System.out.println("Attachement extraction failed. name : " + name
                    + " Destination directory : " + destDirectory);
            System.out.println(e.getMessage());
        }
    }

    /**
     * Extract all attachments from fileName and save them to destDirectory.
     * The content written is the data embedded in the PDF document; each
     * file is stored under the base name of its attachment name.
     *
     * @param fileName
     *            path of the PDF file
     * @param destDirectory
     *            directory the attachments are written to
     * @throws NullPointerException
     *             if fileName or destDirectory is null
     */
    public static void extractAttachements(final String fileName,
            final String destDirectory) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);
        Objects.requireNonNull(destDirectory,
                "destDirectory shouldn't be null");

        try (PDDocument doc = PDDocument.load(new File(fileName))) {
            Optional<Map<String, PDComplexFileSpecification>> specs = readEmbeddedFileSpecs(doc);

            if (!specs.isPresent()) {
                System.out.println("No attachements found");
                return;
            }

            for (Map.Entry<String, PDComplexFileSpecification> entry : specs
                    .get().entrySet()) {
                extractAttachement(entry.getKey(), entry.getValue(),
                        destDirectory);
            }
        } catch (IOException e) {
            System.out.println(e.getMessage());
        }
    }

    /**
     * Validates the document with the PDFBox preflight parser.
     *
     * @param fileName
     *            path of the PDF file
     * @return the validation result, or empty if parsing or validation
     *         raised an I/O error
     * @throws NullPointerException
     *             if fileName is null
     */
    public static Optional<ValidationResult> getValidationResult(String fileName) {
        Objects.requireNonNull(fileName, FILE_NAME_NULL);

        try {
            PreflightParser parser = new PreflightParser(fileName);
            parser.parse();

            try (PreflightDocument document = parser.getPreflightDocument()) {
                document.validate();
                return Optional.of(document.getResult());
            }
        } catch (IOException e) {
            /*
             * NOTE(review): preflight may report syntax problems through an
             * exception that carries its own result; that result is not
             * surfaced here - confirm whether callers need it.
             */
            System.out.println(e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * Return true if file is a valid PDF/A-1b file
     *
     * @param fileName
     *            path of the PDF file
     * @return true if validation ran and reported the file as valid, false
     *         otherwise
     * @throws NullPointerException
     *             if fileName is null
     */
    public static boolean isValidPDF(String fileName) {
        return getValidationResult(fileName)
                .map(ValidationResult::isValid)
                .orElse(false);
    }
}




Previous                                                 Next                                                 Home

No comments:

Post a Comment