Monday, 7 March 2016

PDFBox: Extract all attachments from PDF document

In the previous post, I explained how to get all the attachment names from a PDF document. In this post, I am going to explain how to extract the attachments and save them to a different directory.


Step 1: Load the PDF document.
PDDocument doc = PDDocument.load(new File(fileName))


Step 2: Attachments are stored as part of the "names" dictionary in the document catalog, so get a PDDocumentNameDictionary instance.
// Wrap the document catalog so its "names" dictionary can be queried.
PDDocumentNameDictionary names =
  new PDDocumentNameDictionary(doc.getDocumentCatalog());


Step 3: Get all the embedded files.

// NOTE: the full example below checks efTree for null before using it.
PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
Map<String, PDComplexFileSpecification> existedNames = efTree.getNames();
// keySet() returns a Set, not a List.
Set<String> attachments = existedNames.keySet();


Step 4: Once you have all the attachments, you can save each one to the destination directory.

The following snippet extracts all attachments of a PDF to a directory.

/**
 * Returns the names under which files are embedded in the given PDF document.
 *
 * <p>The names are the keys of the embedded-files name tree found in the
 * "names" dictionary of the document catalog. Only names stored directly on
 * the root node of that tree are returned.
 *
 * @param fileName path of the PDF document to inspect
 * @return the attachment names, or an empty {@code Optional} when the
 *         document has no embedded-files tree, the root node carries no
 *         names, or the document could not be read
 * @throws NullPointerException if {@code fileName} is {@code null}
 */
public static Optional<Set<String>> getAttachements(final String fileName) {

 Objects.requireNonNull(fileName, "fileName shouldn't be null");

 try (final PDDocument doc = PDDocument.load(new File(fileName))) {

  /*
  * Attachments are stored as part of the "names" dictionary in the
  * document catalog
  */
  final PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());

  final PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
  if (Objects.isNull(efTree)) {
   return Optional.empty();
  }

  // getNames() may return null when this node has no names of its own;
  // guard against it instead of failing on keySet().
  final Map<String, PDComplexFileSpecification> existedNames = efTree.getNames();
  if (Objects.isNull(existedNames)) {
   return Optional.empty();
  }

  return Optional.of(existedNames.keySet());

 } catch (IOException e) {
  // Best effort: report the problem and signal "nothing found" to the caller.
  System.out.println(e.getMessage());
  return Optional.empty();
 }

}

/**
 * Copies one file into a directory by delegating to
 * {@code FileUtils.copyFileToDirectory} and reports the outcome on standard
 * output. An {@code IOException} is printed, not rethrown.
 *
 * @param srcFile path of the file to copy
 * @param destDir path of the directory to copy into
 */
private static void copyFileToDirectory(final String srcFile, final String destDir) {
 try {
  final File source = new File(srcFile);
  final File targetDirectory = new File(destDir);
  FileUtils.copyFileToDirectory(source, targetDirectory);
 } catch (IOException copyFailure) {
  System.out.println("File copying failed. src : " + srcFile + " Destination directory : " + destDir);
  System.out.println(copyFailure.getMessage());
  return;
 }
 // Reached only when the copy finished without an IOException.
 System.out.println("Copied file " + srcFile + " to the directory : " + destDir);
}

/**
 * Extracts all attachments embedded in the PDF {@code fileName} and saves
 * them to {@code destDirectory}.
 *
 * <p>Attachment names are keys of the PDF's embedded-files name tree, not
 * paths on disk, so each attachment's embedded stream is read from the
 * document and written to a file in the destination directory. The
 * destination directory is created when it does not exist. Only attachments
 * stored directly on the root node of the name tree are handled. Failures
 * are reported on standard output and are not rethrown.
 *
 * @param fileName path of the PDF document to read
 * @param destDirectory directory that receives the extracted attachments
 * @throws NullPointerException if {@code fileName} or {@code destDirectory}
 *         is {@code null}
 */
public static void extractAttachements(final String fileName,
  final String destDirectory) {
 Objects.requireNonNull(fileName, "fileName shouldn't be null");
 Objects.requireNonNull(destDirectory, "destDirectory shouldn't be null");

 try (final PDDocument doc = PDDocument.load(new File(fileName))) {
  final PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
  final PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles();
  final Map<String, PDComplexFileSpecification> specs =
    Objects.isNull(efTree) ? null : efTree.getNames();

  if (Objects.isNull(specs) || specs.isEmpty()) {
   System.out.println("No attachements found");
   return;
  }

  final File targetDir = new File(destDirectory);
  if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
   System.out.println("Could not create destination directory : " + destDirectory);
   return;
  }

  // The document must stay open while the embedded streams are read.
  for (Map.Entry<String, PDComplexFileSpecification> entry : specs.entrySet()) {
   writeEmbeddedFile(entry.getKey(), entry.getValue(), targetDir);
  }
 } catch (IOException e) {
  System.out.println("Could not read " + fileName);
  System.out.println(e.getMessage());
 }
}

/** Writes the embedded stream of one attachment into targetDir. */
private static void writeEmbeddedFile(final String name, final PDComplexFileSpecification spec,
  final File targetDir) {
 if (Objects.isNull(spec) || Objects.isNull(spec.getEmbeddedFile())) {
  System.out.println("No embedded data for attachement : " + name);
  return;
 }

 final String declaredName = spec.getFilename();
 final String rawName = (Objects.isNull(declaredName) || declaredName.isEmpty()) ? name : declaredName;

 // The name comes from the PDF, so keep only its last path component to
 // stop an attachment from being written outside targetDir.
 final String safeName = new File(rawName).getName();
 if (safeName.isEmpty() || ".".equals(safeName) || "..".equals(safeName)) {
  System.out.println("Skipping attachement with unusable name : " + rawName);
  return;
 }

 final File target = new File(targetDir, safeName);
 try (final java.io.InputStream in = spec.getEmbeddedFile().createInputStream()) {
  java.nio.file.Files.copy(in, target.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING);
  System.out.println("Copied file " + safeName + " to the directory : " + targetDir);
 } catch (IOException e) {
  System.out.println("File copying failed. src : " + safeName + " Destination directory : " + targetDir);
  System.out.println(e.getMessage());
 }
}

import java.io.IOException;

/**
 * Small command-line driver that asks {@code PDFTextStripperUtil} to extract
 * the attachments of a hard-coded sample PDF into a hard-coded directory.
 */
public class PDFTextStripperUtilTest {
 public static void main(String[] args) throws IOException {
  final String pdfPath = "/Users/harikrishna_gurram/Downloads/Saurabh.pdf";
  final String destinationDirectory = "/Users/harikrishna_gurram/temp_copy";

  PDFTextStripperUtil.extractAttachements(pdfPath, destinationDirectory);
 }
}


Go through the following post to get the code of the complete utility class.




Previous                                                 Next                                                 Home

No comments:

Post a Comment