Thursday, 1 October 2015

openNLP: utility programs

The following are utility programs for working with OpenNLP. You can use these programs as a basis for your application.


Class
Description
CategoryDetectorUtil.java
Identifies category for given text.
CategoryTrainUtil.java
Train model for category detection.
DeTokenizerUtil.java
Utility class to form sentence from tokens.
FileUtils.java
Simple utility class to convert file data to string.
NameFinderTrainUtil.java
Utility class to train model for name finding.
NameFinderUtil.java
Utility class to identify names.
POSTaggerUtil.java
Utility class to identify parts of speech such as noun, verb, adjective, etc.
SentenceDetectorUtil.java
Utility class to detect sentences.
SentenceTrainer.java
Utility class to train model for detecting sentences.
TokenizerTrainer.java
Utility class to train model to identify tokens.
TokenizerUtil.java
Utility class to identify tokens


import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Objects;

import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;

/**
 * Identifies the best-suited category for a given text using a document
 * categorizer model loaded from a file.
 * 
 * @author harikrishna_gurram
 */
public class CategoryDetectorUtil {
 private DoccatModel docCatModel;
 private DocumentCategorizerME myCategorizer;

 /**
  * Loads the categorizer model from the given file. If loading fails, the
  * error message is printed to standard output and the instance is left
  * without a categorizer; {@link #getCategory(String)} then throws
  * {@link IllegalStateException}.
  * 
  * @param modelFile path of the model file; must not be {@code null}
  * @throws NullPointerException if {@code modelFile} is {@code null}
  */
 public CategoryDetectorUtil(String modelFile) {
  // Objects.nonNull only returns a boolean; requireNonNull actually rejects null.
  Objects.requireNonNull(modelFile, "modelFile");
  initModel(modelFile);
 }

 /**
  * Reads the model file and builds the categorizer from it. The stream is
  * closed as soon as the model has been read.
  */
 private void initModel(String modelFile) {
  try (InputStream modelIn = new FileInputStream(modelFile)) {
   docCatModel = new DoccatModel(modelIn);
   myCategorizer = new DocumentCategorizerME(docCatModel);
  } catch (Exception e) {
   System.out.println(e.getMessage());
  }
 }

 /**
  * Returns the best-suited category for the given text.
  * 
  * @param text text to categorize
  * @return the category the categorizer reports as best for the computed outcomes
  * @throws IllegalStateException if the model could not be loaded at construction time
  */
 public String getCategory(String text) {
  if (myCategorizer == null) {
   // Report the real cause instead of an anonymous NullPointerException.
   throw new IllegalStateException(
     "Category model was not loaded; see the error reported at construction");
  }
  double[] outcomes = myCategorizer.categorize(text);
  return myCategorizer.getBestCategory(outcomes);
 }
}


import java.io.*;
import java.util.Objects;

import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

/**
 * Trains a model for category detection.
 * 
 * @author harikrishna_gurram
 */
public class CategoryTrainUtil {

 /**
  * Trains a document categorizer on the given training file (read as UTF-8,
  * language code "en", default training parameters) and writes the resulting
  * model to {@code modelFile}.
  * 
  * @param inputFile path of the file that contains the training data
  * @param modelFile path of the file the trained model is written to
  * @throws IOException if the training data cannot be read or the model
  *             cannot be written
  * @throws NullPointerException if either argument is {@code null}
  */
 public static void trainModel(String inputFile, String modelFile)
   throws IOException {
  // Objects.nonNull only returns a boolean; requireNonNull actually rejects null.
  Objects.requireNonNull(inputFile, "inputFile");
  Objects.requireNonNull(modelFile, "modelFile");

  MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory(
    new File(inputFile));
  ObjectStream<String> lineStream = new PlainTextByLineStream(
    factory, "UTF-8");
  ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(
    lineStream);

  DoccatModel model;
  try {
   model = DocumentCategorizerME.train("en", sampleStream,
     TrainingParameters.defaultParams(), new DoccatFactory());
  } finally {
   // Release the training file whether or not training succeeded.
   sampleStream.close();
  }

  // I/O failures propagate to the caller, as the signature promises,
  // instead of being printed and ignored.
  try (OutputStream modelOut = new BufferedOutputStream(
    new FileOutputStream(new File(modelFile)))) {
   model.serialize(modelOut);
  }
 }
}


import opennlp.tools.tokenize.DetokenizationDictionary;
import opennlp.tools.tokenize.DetokenizationDictionary.Operation;
import opennlp.tools.tokenize.DictionaryDetokenizer;

/**
 * Helper for rebuilding a sentence string from its tokens.
 * 
 * @author harikrishna_gurram
 */
public class DeTokenizerUtil {

 /**
  * Joins the given tokens into a single string. A detokenization dictionary
  * is built in which every token is registered with the same operation, and
  * the tokens are then passed to a dictionary detokenizer with {@code " "}
  * as the second argument.
  * 
  * @param tokens
  *            array of tokens to join
  * @param operation
  *            operation applied to every token; one of
  *            {@link DetokenizationDictionary.Operation#MOVE_BOTH},
  *            {@link DetokenizationDictionary.Operation#MOVE_LEFT},
  *            {@link DetokenizationDictionary.Operation#MOVE_RIGHT},
  *            {@link DetokenizationDictionary.Operation#RIGHT_LEFT_MATCHING}
  * @return the string produced by the detokenizer
  */
 public static String deTokenize(String[] tokens,
   DetokenizationDictionary.Operation operation) {
  int count = tokens.length;
  Operation[] perTokenOps = new Operation[count];

  // Every token gets the same operation.
  int idx = 0;
  while (idx < count) {
   perTokenOps[idx] = operation;
   idx++;
  }

  DictionaryDetokenizer joiner = new DictionaryDetokenizer(
    new DetokenizationDictionary(tokens, perTokenOps));

  String sentence = joiner.detokenize(tokens, " ");
  return sentence;
 }
}


import static java.nio.file.Files.readAllBytes;
import static java.nio.file.Paths.get;

import java.io.IOException;
import java.util.Objects;

/**
 * Simple utility class to convert file data to string.
 * 
 * @author harikrishna_gurram
 */
public class FileUtils {
 /**
  * Reads the whole file and decodes its bytes as UTF-8.
  * 
  * @param fileName path of the file to read; must not be {@code null}
  * @return the file content, or {@code null} if the file could not be read
  *         (the error message is printed to standard output)
  * @throws NullPointerException if {@code fileName} is {@code null}
  */
 public static String getFileDataAsString(String fileName) {
  // Objects.nonNull only returns a boolean; requireNonNull actually rejects null.
  Objects.requireNonNull(fileName, "fileName");
  try {
   // Decode explicitly as UTF-8 rather than with the platform default
   // charset, so the result does not depend on the machine's locale.
   return new String(readAllBytes(get(fileName)),
     java.nio.charset.StandardCharsets.UTF_8);
  } catch (IOException e) {
   System.out.println(e.getMessage());
   return null;
  }
 }
}


import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderFactory;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

/**
 * Utility class to train a model for name finding.
 * 
 * @author harikrishna_gurram
 */
public class NameFinderTrainUtil {

 /**
  * Trains a name finder on the given training file (read as UTF-8, language
  * code "en", type "person", default training parameters) and writes the
  * resulting model to {@code modelFile}.
  * 
  * @param inputFile path of the file that contains the training data
  * @param modelFile path of the file the trained model is written to
  * @throws IOException if the training data cannot be read or the model
  *             cannot be written
  * @throws NullPointerException if either argument is {@code null}
  */
 public static void trainModel(String inputFile, String modelFile)
   throws IOException {
  // Validate up front so a null modelFile is rejected before the whole
  // model has been trained, not afterwards when opening the output file.
  java.util.Objects.requireNonNull(inputFile, "inputFile");
  java.util.Objects.requireNonNull(modelFile, "modelFile");

  Charset charset = Charset.forName("UTF-8");

  MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory(
    new File(inputFile));
  ObjectStream<String> lineStream = new PlainTextByLineStream(factory,
    charset);

  ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
    lineStream);

  TokenNameFinderModel model;

  try {
   model = NameFinderME.train("en", "person", sampleStream,
     TrainingParameters.defaultParams(),
     new TokenNameFinderFactory());
  } finally {
   // Release the training file whether or not training succeeded.
   sampleStream.close();
  }

  try (OutputStream modelOut = new BufferedOutputStream(
    new FileOutputStream(modelFile))) {
   model.serialize(modelOut);
  }
 }
}


import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;

/**
 * Utility class to identify names.
 * 
 * @author harikrishna_gurram
 */
public class NameFinderUtil {

 /**
  * Gets all the names found in the given input file. The file content is
  * tokenized with the learnable tokenizer, the name finder is run over the
  * tokens, and each detected span is returned as its tokens joined by a
  * single space.
  * 
  * @param tokenizerModel model file used to tokenize the content of the file
  * @param nameModel model file used to identify names
  * @param inputFile file whose content is searched for names
  * @return the detected names in span order, or {@code null} if the name
  *         model or the input file could not be read
  * @throws FileNotFoundException if {@code nameModel} cannot be opened
  */
 public static String[] getNames(String tokenizerModel, String nameModel,
   String inputFile) throws FileNotFoundException {

  TokenNameFinderModel model = loadModel(nameModel);
  if (model == null) {
   return null;
  }

  NameFinderME nameFinder = new NameFinderME(model);
  TokenizerUtil tokenizerUtil = new TokenizerUtil(tokenizerModel);

  String data = FileUtils.getFileDataAsString(inputFile);
  if (data == null) {
   // FileUtils has already reported why the file could not be read.
   return null;
  }

  String[] tokens = tokenizerUtil.tokenizeUsingLearnableTokenizer(data);
  Span[] nameSpans = nameFinder.find(tokens);

  // Joins the tokens of each span with a space, so a multi-token name
  // comes back as "John Smith" rather than "JohnSmith".
  return Span.spansToStrings(nameSpans, tokens);
 }

 /**
  * Reads the name finder model; returns {@code null} (after printing the
  * stack trace) when the file exists but cannot be read as a model.
  */
 private static TokenNameFinderModel loadModel(String nameModel)
   throws FileNotFoundException {
  InputStream modelIn = new FileInputStream(nameModel);
  try {
   return new TokenNameFinderModel(modelIn);
  } catch (IOException e) {
   e.printStackTrace();
   return null;
  } finally {
   try {
    modelIn.close();
   } catch (IOException ignored) {
    // The load result is already decided; a close failure is not actionable here.
   }
  }
 }
}


import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;

import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;

/**
 * Utility class to identify parts of speech such as noun, verb, adjective, etc.
 * 
 * @author harikrishna_gurram
 */
public class POSTaggerUtil {

 POSModel posModel = null;
 String modelFile = null;
 POSTaggerME tagger = null;
 TokenizerUtil tokenizerUtil = null;

 /**
  * Loads the POS model, creates the tagger and creates the tokenizer helper.
  * 
  * @param posModelFile
  *            parts-of-speech model file name
  * @param tokenizerModelFile
  *            tokenizer model file name, used to create the
  *            {@link TokenizerUtil} this class tokenizes with
  * @throws NullPointerException if either argument is {@code null}
  * @throws IllegalStateException if the POS model file cannot be read
  */
 public POSTaggerUtil(String posModelFile, String tokenizerModelFile) {
  // Objects.nonNull only returns a boolean; requireNonNull actually rejects null.
  Objects.requireNonNull(posModelFile, "posModelFile");
  Objects.requireNonNull(tokenizerModelFile, "tokenizerModelFile");
  modelFile = posModelFile;
  initModel();
  tagger = new POSTaggerME(posModel);
  tokenizerUtil = new TokenizerUtil(tokenizerModelFile);
 }

 /**
  * Reads {@link #modelFile} into {@link #posModel}. Fails with the cause
  * attached rather than continuing with a null model.
  */
 private void initModel() {
  try (InputStream modelIn = new FileInputStream(modelFile)) {
   posModel = new POSModel(modelIn);
  } catch (IOException e) {
   throw new IllegalStateException(
     "Unable to load POS model from " + modelFile, e);
  }
 }

 /**
  * Gets POS tags for the given sentence. The sentence is split with the
  * whitespace tokenizer and the tokens are passed to the tagger.
  * 
  * @param sentence text to tag
  * @return the tags returned by the tagger for the whitespace-separated tokens
  */
 public String[] getTags(String sentence) {
  String[] tokens = tokenizerUtil
    .tokenizeUsingWhiteSpaceTokenizer(sentence);
  return tagger.tag(tokens);
 }

 /**
  * Gets POS tags for the content of a file.
  * 
  * @param fileName path of the file to tag; must not be {@code null}
  * @return the tags for the whitespace-separated tokens of the file content
  * @throws NullPointerException if {@code fileName} is {@code null}
  */
 public String[] getTagsForFile(String fileName) {
  Objects.requireNonNull(fileName, "fileName");
  // NOTE(review): FileUtils returns null for an unreadable file; that null
  // is passed on to getTags unchanged.
  String data = FileUtils.getFileDataAsString(fileName);
  return getTags(data);
 }

}


import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;

/**
 * Utility class to detect sentences.
 * 
 * @author harikrishna_gurram
 */
public class SentenceDetectorUtil {
 private SentenceModel model = null;
 SentenceDetectorME sentenceDetector = null;

 /**
  * Loads the sentence model and creates the sentence detector from it.
  * 
  * @param modelFile
  *            model file used to detect sentences
  * @throws NullPointerException if {@code modelFile} is {@code null}
  * @throws IllegalStateException if the model file cannot be read
  */
 public SentenceDetectorUtil(String modelFile) {
  // Objects.nonNull only returns a boolean; requireNonNull actually rejects null.
  Objects.requireNonNull(modelFile, "modelFile");
  initSentenceModel(modelFile);
  initSentenceDetectorME();
 }

 /** Creates the detector from the already loaded model. */
 private void initSentenceDetectorME() {
  sentenceDetector = new SentenceDetectorME(model);
 }

 /**
  * Reads the sentence model from the given file into {@link #model}. Fails
  * with the cause attached rather than continuing with a null model.
  * 
  * @return the loaded model
  */
 private SentenceModel initSentenceModel(String file) {
  try (InputStream modelIn = new FileInputStream(file)) {
   model = new SentenceModel(modelIn);
  } catch (IOException e) {
   throw new IllegalStateException(
     "Unable to load sentence model from " + file, e);
  }
  return model;
 }

 /**
  * Returns all sentences from the given file.
  * 
  * @param inputFile path of the file whose content is split into sentences
  * @return the sentences reported by the detector
  */
 public String[] getSentencesFromFile(String inputFile) {
  // NOTE(review): FileUtils returns null for an unreadable file; that null
  // is passed on to the detector unchanged.
  String data = FileUtils.getFileDataAsString(inputFile);
  return sentenceDetector.sentDetect(data);
 }

 /**
  * Returns all sentences from the given data.
  * 
  * @param data text to split into sentences
  * @return the sentences reported by the detector
  */
 public String[] getSentences(String data) {
  return sentenceDetector.sentDetect(data);
 }

}


import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.Objects;

import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.sentdetect.SentenceDetectorFactory;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.SentenceSampleStream;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

/**
 * Utility class to train a model for detecting sentences.
 * 
 * @author harikrishna_gurram
 */
public class SentenceTrainer {

 /**
  * Trains a sentence detector on the given training file (read as UTF-8,
  * language code "en", default training parameters) and writes the
  * resulting model to {@code modelFile}. The characters {@code '.'} and
  * {@code ';'} are passed to the factory as end-of-sentence characters.
  * 
  * @param inputFile
  *            path of the file that contains the training data
  * @param modelFile
  *            path of the file the trained model is written to
  * @throws IOException if the training data cannot be read or the model
  *             cannot be written
  * @throws NullPointerException if either argument is {@code null}
  */
 public static void trainModel(String inputFile, String modelFile)
   throws IOException {
  // Objects.nonNull only returns a boolean; requireNonNull rejects null
  // before any training work is done.
  Objects.requireNonNull(inputFile, "inputFile");
  Objects.requireNonNull(modelFile, "modelFile");

  MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory(
    new File(inputFile));

  Charset charset = Charset.forName("UTF-8");
  ObjectStream<String> lineStream = new PlainTextByLineStream(factory,
    charset);
  ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(
    lineStream);

  SentenceModel model;

  try {
   char[] eosCharacters = { '.', ';' };
   SentenceDetectorFactory sentenceDetectorFactory = new SentenceDetectorFactory(
     "en", true, new Dictionary(), eosCharacters);
   model = SentenceDetectorME
     .train("en", sampleStream, sentenceDetectorFactory,
       TrainingParameters.defaultParams());
  } finally {
   // Release the training file whether or not training succeeded.
   sampleStream.close();
  }

  try (OutputStream modelOut = new BufferedOutputStream(
    new FileOutputStream(modelFile))) {
   model.serialize(modelOut);
  }
 }
}


import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.Objects;

import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

/**
 * Utility class to train a model to identify tokens.
 * 
 * @author harikrishna_gurram
 */
public class TokenizerTrainer {
 /**
  * Trains a tokenizer on the given training file (read as UTF-8, language
  * code "en", default training parameters) and writes the resulting model
  * to {@code modelFile}.
  * 
  * @param inputFile
  *            path of the file that contains the training data
  * @param modelFile
  *            path of the file the trained model is written to
  * @throws IOException if the training data cannot be read or the model
  *             cannot be written
  * @throws NullPointerException if either argument is {@code null}
  */
 public static void trainModel(String inputFile, String modelFile)
   throws IOException {
  // Objects.nonNull only returns a boolean; requireNonNull rejects null
  // before any training work is done.
  Objects.requireNonNull(inputFile, "inputFile");
  Objects.requireNonNull(modelFile, "modelFile");

  Charset charset = Charset.forName("UTF-8");

  MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory(
    new File(inputFile));
  ObjectStream<String> lineStream = new PlainTextByLineStream(factory,
    charset);
  ObjectStream<TokenSample> sampleStream = new TokenSampleStream(
    lineStream);

  TokenizerModel model;

  try {
   TokenizerFactory tokenizerFactory = new TokenizerFactory("en",
     new Dictionary(), false, null);

   model = TokenizerME.train(sampleStream, tokenizerFactory,
     TrainingParameters.defaultParams());
  } finally {
   // Release the training file whether or not training succeeded.
   sampleStream.close();
  }

  try (OutputStream modelOut = new BufferedOutputStream(
    new FileOutputStream(modelFile))) {
   model.serialize(modelOut);
  }

 }
}


import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;

import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;

/**
 * Utility class to identify tokens, either with a trained (learnable)
 * tokenizer model or with the whitespace tokenizer.
 * 
 * @author harikrishna_gurram
 */
public class TokenizerUtil {
 TokenizerModel model = null;
 Tokenizer learnableTokenizer = null;

 /**
  * Loads the tokenizer model and creates the learnable tokenizer from it.
  * 
  * @param modelFile path of the tokenizer model file
  * @throws NullPointerException if {@code modelFile} is {@code null}
  * @throws IllegalStateException if the model file cannot be read
  */
 public TokenizerUtil(String modelFile) {
  initTokenizerModel(modelFile);
  learnableTokenizer = new TokenizerME(model);
 }

 /**
  * Reads the tokenizer model from the given file into {@link #model}. Fails
  * with the cause attached rather than continuing with a null model.
  */
 private void initTokenizerModel(String modelFile) {
  // Objects.nonNull only returns a boolean; requireNonNull actually rejects null.
  Objects.requireNonNull(modelFile, "modelFile");

  try (InputStream modelIn = new FileInputStream(modelFile)) {
   model = new TokenizerModel(modelIn);
  } catch (IOException e) {
   throw new IllegalStateException(
     "Unable to load tokenizer model from " + modelFile, e);
  }
 }

 /** @return the tokenizer backed by the loaded model */
 public Tokenizer getLearnableTokenizer() {
  return learnableTokenizer;
 }

 /** @return the shared whitespace tokenizer instance */
 public Tokenizer getWhitespaceTokenizer() {
  return WhitespaceTokenizer.INSTANCE;
 }

 /**
  * Tokenizes the content of a file with the learnable tokenizer.
  * 
  * @param file path of the file to tokenize
  * @return the tokens of the file content
  */
 public String[] tokenizeFileUsingLearnableTokenizer(String file) {
  String data = FileUtils.getFileDataAsString(file);
  return learnableTokenizer.tokenize(data);
 }

 /**
  * Tokenizes the given text with the learnable tokenizer.
  * 
  * @param data text to tokenize
  * @return the tokens of the text
  */
 public String[] tokenizeUsingLearnableTokenizer(String data) {
  return learnableTokenizer.tokenize(data);
 }

 /**
  * Tokenizes the content of a file with the whitespace tokenizer.
  * 
  * @param file path of the file to tokenize
  * @return the tokens of the file content
  */
 public String[] tokenizeFileUsingWhiteSpaceTokenizer(String file) {
  String data = FileUtils.getFileDataAsString(file);
  return getWhitespaceTokenizer().tokenize(data);
 }

 /**
  * Tokenizes the given text with the whitespace tokenizer.
  * 
  * @param data text to tokenize
  * @return the tokens of the text
  */
 public String[] tokenizeUsingWhiteSpaceTokenizer(String data) {
  return getWhitespaceTokenizer().tokenize(data);
 }
}

Previous                                                 Next                                                 Home

No comments:

Post a Comment