The following
utility class provides a number of methods to work with PDF files. I used
the following Maven dependencies.
<dependencies> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.0-RC3</version> </dependency> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox-lucene</artifactId> <version>1.8.11</version> </dependency> <dependency> <groupId>org.bouncycastle</groupId> <artifactId>bcprov-jdk16</artifactId> <version>1.46</version> </dependency> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>2.4</version> </dependency> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>preflight</artifactId> <version>2.0.0-RC3</version> </dependency> </dependencies>
The following is a brief summary of the
methods defined in the utility class.
Method
|
Description
|
public static Optional<String>
getDataAsString(final String fileName)
|
Return pdf file data as a string.
|
public static Optional<String>
getDataAsString(final String fileName,final int startPage, final int endPage)
|
Return pdf file data as a string from
given start page to end page.
|
public static Optional<String>
getDataAsStringFromStartPage(String fileName, int startPage)
|
Return pdf file data as a string from
given page to the end.
|
public static Optional<String>
getDataAsStringTillEndPage(String fileName,int endPage)
|
Return pdf file data as a string from
the first page to the given end page.
|
public static Optional<Integer>
getNumberOfPages(String fileName)
|
Get number of pages.
|
public static
Optional<Map<String, Object>> getDocumentBasicMetaData(final String
fileName)
|
Get basic metadata of PDF document.
|
public static
Optional<List<String>> getCatalogMetaData(final String fileName)
|
Get catalog meta data.
|
private static
Optional<List<String>> getDataFromStream(InputStream in)
|
Read data from input stream.
|
private static
Optional<List<String>> getMeatData(PDMetadata metadata)
|
Get meta data from PDMetadata.
|
public static Optional<List<String>>
getPDPageMetaData(final String fileName, int pageIndex)
|
Get page metadata.
|
public static boolean
encryptDocument(final String fileName,final String encryptedFileName, final
int keyLength,final String ownerPassword, final String userPassword)
|
Encrypt PDF document.
|
public static boolean encryptDocument(final
String fileName,final String encryptedFileName, final String
ownerPassword,final String userPassword)
|
Encrypt PDF document. Uses keylength
128
|
public static boolean
encryptDocument(final String fileName,final String ownerPassword, final
String userPassword)
|
Encrypt PDF document. Uses a key length of
128; the result is stored in a file named encrypted_ followed by the original file name.
|
public static boolean
addAtachement(final String fileName,final String... attachements)
|
Add attachments to PDF file.
|
public static
Optional<Set<String>> getAttachements(final String fileName)
|
Return all attachments.
|
public static void
extractAttachements(final String fileName,final String destDirectory)
|
Extract attachments to destination
directory.
|
public static Optional<ValidationResult>
getValidationResult(String fileName)
|
Validate PDF Document.
|
public static boolean
isValidPDF(String fileName)
|
Return true, if PDF is valid PDF/A-1b
file, else false.
|
private static void
copyFileToDirectory(final String srcFile,final String destDir)
|
Copy file to destination directory.
|
import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.GregorianCalendar; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.Set; import org.apache.commons.io.FileUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDMetadata; import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.StandardProtectionPolicy; import org.apache.pdfbox.preflight.PreflightDocument; import org.apache.pdfbox.preflight.ValidationResult; import org.apache.pdfbox.preflight.parser.PreflightParser; import org.apache.pdfbox.text.PDFTextStripper; /** * Utility class to work with PDF files * * @author harikrishna_gurram * */ public class PDFTextStripperUtil { /** * @param fileName * @return complete file data as string * @throws NullPointerException * if fileName is null */ public static Optional<String> getDataAsString(final String fileName) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } try (final PDDocument pdDoc = PDDocument.load(new File(fileName))) { PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stripper.setAddMoreFormatting(true); return Optional.of(stripper.getText(pdDoc)); } catch (IOException e) { System.out.println(e.getMessage()); return 
Optional.empty(); } } public static Optional<String> getDataAsString(final String fileName, final int startPage, final int endPage) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } if (startPage < 1 || endPage < 1 || endPage < startPage) { throw new IllegalArgumentException( "startPage, endPage must >= 1 and endPage >= startPage"); } try (final PDDocument pdDoc = PDDocument.load(new File(fileName))) { PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stripper.setAddMoreFormatting(true); stripper.setStartPage(startPage); stripper.setEndPage(endPage); return Optional.of(stripper.getText(pdDoc)); } catch (IOException e) { System.out.println(e.getMessage()); return Optional.empty(); } } public static Optional<String> getDataAsStringFromStartPage( String fileName, int startPage) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } if (startPage < 1) { throw new IllegalArgumentException("startPage must >= 1"); } try (final PDDocument pdDoc = PDDocument.load(new File(fileName))) { int noOfPages = pdDoc.getNumberOfPages(); return getDataAsString(fileName, startPage, noOfPages); } catch (IOException e) { System.out.println(e.getMessage()); return Optional.empty(); } } public static Optional<String> getDataAsStringTillEndPage(String fileName, int endPage) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } if (endPage < 1) { throw new IllegalArgumentException("endPage must >= 1"); } return getDataAsString(fileName, 1, endPage); } public static Optional<Integer> getNumberOfPages(String fileName) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } try (final PDDocument pdDoc = PDDocument.load(new File(fileName))) { return Optional.of(pdDoc.getNumberOfPages()); } catch (IOException e) { System.out.println(e.getMessage()); return Optional.empty(); } } public 
static Optional<Map<String, Object>> getDocumentBasicMetaData( final String fileName) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } try (final PDDocument pdDoc = PDDocument.load(new File(fileName))) { PDDocumentInformation docInfo = pdDoc.getDocumentInformation(); Set<String> keys = docInfo.getMetadataKeys(); Map<String, Object> map = new HashMap<>(); for (String key : keys) { map.put(key, docInfo.getPropertyStringValue(key)); } return Optional.of(map); } catch (IOException e) { System.out.println(e.getMessage()); return Optional.empty(); } } public static Optional<List<String>> getCatalogMetaData( final String fileName) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } try (final PDDocument pdDoc = PDDocument.load(new File(fileName))) { PDDocumentCatalog catalog = pdDoc.getDocumentCatalog(); PDMetadata metadata = catalog.getMetadata(); return getMeatData(metadata); } catch (IOException e) { System.out.println(e.getMessage()); return Optional.empty(); } } private static Optional<List<String>> getDataFromStream(InputStream in) { try (BufferedReader br = new BufferedReader(new InputStreamReader(in))) { List<String> data = new ArrayList<>(); String str; while ((str = br.readLine()) != null) { data.add(str); } return Optional.of(data); } catch (IOException e) { System.out.println(e.getMessage()); return Optional.empty(); } } private static Optional<List<String>> getMeatData(PDMetadata metadata) { if (metadata == null) { System.out.println("There is no meta data associated"); return Optional.empty(); } try (InputStream in = metadata.createInputStream()) { return getDataFromStream(in); } catch (IOException e) { return Optional.empty(); } } public static Optional<List<String>> getPDPageMetaData( final String fileName, int pageIndex) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } if (pageIndex < 1) { throw new 
IllegalArgumentException("pageIndex must >= 1"); } try (final PDDocument pdDoc = PDDocument.load(new File(fileName))) { if (pageIndex > pdDoc.getNumberOfPages()) { throw new IllegalArgumentException("pageIndex : " + pageIndex + " must <= " + pdDoc.getNumberOfPages()); } PDPage pdPage = pdDoc.getPage(pageIndex); PDMetadata metadata = pdPage.getMetadata(); return getMeatData(metadata); } catch (IOException e) { System.out.println(e.getMessage()); return Optional.empty(); } } public static boolean encryptDocument(final String fileName, final String encryptedFileName, final int keyLength, final String ownerPassword, final String userPassword) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } if (Objects.isNull(encryptedFileName)) { throw new NullPointerException( "encryptedFileName shouldn't be null"); } if (keyLength < 1) { throw new IllegalArgumentException("keyLength should > 0"); } if (Objects.isNull(ownerPassword)) { throw new NullPointerException("ownerPassword shouldn't be null"); } if (Objects.isNull(userPassword)) { throw new NullPointerException("userPassword shouldn't be null"); } try (PDDocument doc = PDDocument.load(new File(fileName))) { AccessPermission ap = new AccessPermission(); /* disable printing, everything else is allowed */ ap.setCanPrint(false); StandardProtectionPolicy spp = new StandardProtectionPolicy( ownerPassword, userPassword, ap); /* * Define the length of the encryption key. 
Possible values are 40, * 128 256 */ spp.setEncryptionKeyLength(keyLength); spp.setPermissions(ap); doc.protect(spp); doc.save(encryptedFileName); return true; } catch (IOException e) { System.out.println(e.getMessage()); return false; } } public static boolean encryptDocument(final String fileName, final String encryptedFileName, final String ownerPassword, final String userPassword) { return encryptDocument(fileName, encryptedFileName, 128, ownerPassword, userPassword); } public static boolean encryptDocument(final String fileName, final String ownerPassword, final String userPassword) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } File file = new File(fileName); String encryptedFileName = "encrypted_" + file.getName(); return encryptDocument(fileName, encryptedFileName, 128, ownerPassword, userPassword); } public static boolean addAtachement(final String fileName, final String... attachements) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } if (Objects.isNull(attachements)) { throw new NullPointerException("attachements shouldn't be null"); } Map<String, PDComplexFileSpecification> efMap = new HashMap<>(); try (PDDocument doc = PDDocument.load(new File(fileName))) { /* * Attachments are stored as part of the "names" dictionary in the * document catalog */ PDDocumentNameDictionary names = new PDDocumentNameDictionary( doc.getDocumentCatalog()); PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles(); if (Objects.isNull(efTree)) { efTree = new PDEmbeddedFilesNameTreeNode(); } Map<String, PDComplexFileSpecification> existedNames = efTree .getNames(); efMap.putAll(existedNames); for (String attachement : attachements) { /* Create the file specification, which holds the embedded file */ PDComplexFileSpecification fs = new PDComplexFileSpecification(); fs.setFile(attachement); try (InputStream is = new FileInputStream(attachement)) { /* This represents an 
embedded file in a file specification */ PDEmbeddedFile ef = new PDEmbeddedFile(doc, is); /* Set some relevant properties of embedded file */ ef.setCreationDate(new GregorianCalendar()); fs.setEmbeddedFile(ef); /* * now add the entry to the embedded file tree and set in * the document. */ efMap.put(attachement, fs); } } efTree.setNames(efMap); names.setEmbeddedFiles(efTree); doc.getDocumentCatalog().setNames(names); doc.save(fileName); return true; } catch (IOException e) { System.out.println(e.getMessage()); return false; } } public static Optional<Set<String>> getAttachements(final String fileName) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn'e be null"); } try (final PDDocument doc = PDDocument.load(new File(fileName))) { /* * Attachments are stored as part of the "names" dictionary in the * document catalog */ final PDDocumentNameDictionary names = new PDDocumentNameDictionary( doc.getDocumentCatalog()); final PDEmbeddedFilesNameTreeNode efTree = names.getEmbeddedFiles(); if (Objects.isNull(efTree)) { return Optional.empty(); } final Map<String, PDComplexFileSpecification> existedNames = efTree .getNames(); return Optional.of(existedNames.keySet()); } catch (IOException e) { System.out.println(e.getMessage()); return Optional.empty(); } } private static void copyFileToDirectory(final String srcFile, final String destDir) { try { FileUtils.copyFileToDirectory(new File(srcFile), new File(destDir)); System.out.println("Copied file " + srcFile + " to the directory : " + destDir); } catch (IOException e) { System.out.println("File copying failed. src : " + srcFile + " Destination directory : " + destDir); System.out.println(e.getMessage()); } } /** * Extract all attachments from fileName and save them to destDirectory. 
* * @param fileName * @param destDirectory * @return */ public static void extractAttachements(final String fileName, final String destDirectory) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } if (Objects.isNull(destDirectory)) { throw new NullPointerException("destDirectory shouldn't be null"); } Optional<Set<String>> attachements = getAttachements(fileName); if (!attachements.isPresent()) { System.out.println("No attachements found"); return; } Set<String> paths = attachements.get(); for (String attachementPath : paths) { copyFileToDirectory(attachementPath, destDirectory); } } public static Optional<ValidationResult> getValidationResult(String fileName) { if (Objects.isNull(fileName)) { throw new NullPointerException("fileName shouldn't be null"); } try { PreflightParser parser = new PreflightParser(fileName); parser.parse(); try (PreflightDocument document = parser.getPreflightDocument()) { document.validate(); ValidationResult result = document.getResult(); return Optional.of(result); } } catch (IOException e) { return Optional.empty(); } } /** * Return true if file is a valid PDF/A-1b file * * @param fileName * @return */ public static boolean isValidPDF(String fileName) { Optional<ValidationResult> validationResult = getValidationResult(fileName); if (!validationResult.isPresent()) { return false; } ValidationResult result = validationResult.get(); if (result.isValid()) { return true; } return false; } }
No comments:
Post a Comment