The following are utility programs for working with OpenNLP. You can use these programs as a basis for your application.
| Class | Description |
| --- | --- |
| CategoryDetectorUtil.java | Identifies the category for given text. |
| CategoryTrainUtil.java | Trains a model for category detection. |
| DeTokenizerUtil.java | Utility class to form a sentence from tokens. |
| FileUtils.java | Simple utility class to convert file data to a string. |
| NameFinderTrainUtil.java | Utility class to train a model for name finding. |
| NameFinderUtil.java | Utility class to identify names. |
| POSTaggerUtil.java | Utility class to identify parts of speech such as noun, verb, adjective, etc. |
| SentenceDetectorUtil.java | Utility class to detect sentences. |
| SentenceTrainer.java | Utility class to train a model for detecting sentences. |
| TokenizerTrainer.java | Utility class to train a model to identify tokens. |
| TokenizerUtil.java | Utility class to identify tokens. |
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;

import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;

/**
 * Identifies the best-matching category for a given piece of text using a
 * previously trained document-categorizer (doccat) model.
 *
 * @author harikrishna_gurram
 */
public class CategoryDetectorUtil {

    private DoccatModel docCatModel;
    private DocumentCategorizerME myCategorizer;

    /**
     * @param modelFile path of a trained doccat model file; must not be null
     */
    public CategoryDetectorUtil(String modelFile) {
        // Bug fix: Objects.nonNull() is just a boolean predicate and performs
        // no validation; requireNonNull() actually rejects null arguments.
        Objects.requireNonNull(modelFile, "modelFile");
        initModel(modelFile);
    }

    /**
     * Loads the doccat model from disk and builds the categorizer.
     * On I/O failure the error is reported and the categorizer stays null
     * (preserves the original best-effort behavior).
     */
    private void initModel(String modelFile) {
        // try-with-resources closes the stream; the original kept an open
        // InputStream field for the lifetime of the object (resource leak).
        try (InputStream inputStream = new FileInputStream(modelFile)) {
            docCatModel = new DoccatModel(inputStream);
            myCategorizer = new DocumentCategorizerME(docCatModel);
        } catch (IOException e) {
            System.out.println(e.getMessage());
        }
    }

    /**
     * Returns the best suited category for the given text.
     *
     * @param text text to categorize
     * @return name of the most probable category
     */
    public String getCategory(String text) {
        double[] outcomes = myCategorizer.categorize(text);
        return myCategorizer.getBestCategory(outcomes);
    }
}
import java.io.*; import java.util.Objects; import opennlp.tools.doccat.DoccatFactory; import opennlp.tools.doccat.DoccatModel; import opennlp.tools.doccat.DocumentCategorizerME; import opennlp.tools.doccat.DocumentSample; import opennlp.tools.doccat.DocumentSampleStream; import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; /** * Train model for category detection * * @author harikrishna_gurram */ public class CategoryTrainUtil { /** * @param inputFile * : Contains training data * @param modelFile * : Model file contains data after training * @throws IOException */ public static void trainModel(String inputFile, String modelFile) throws IOException { Objects.nonNull(inputFile); Objects.nonNull(modelFile); DoccatModel model = null; try { MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory( new File(inputFile)); ObjectStream<String> lineStream = new PlainTextByLineStream( factory, "UTF-8"); ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream( lineStream); model = DocumentCategorizerME.train("en", sampleStream, TrainingParameters.defaultParams(), new DoccatFactory()); OutputStream modelOut = null; File modelFileTmp = new File(modelFile); modelOut = new BufferedOutputStream(new FileOutputStream( modelFileTmp)); model.serialize(modelOut); } catch (IOException e) { e.printStackTrace(); } } }
import opennlp.tools.tokenize.DetokenizationDictionary;
import opennlp.tools.tokenize.DetokenizationDictionary.Operation;
import opennlp.tools.tokenize.DictionaryDetokenizer;

/**
 * Utility class that rebuilds a sentence string from an array of tokens.
 *
 * @author harikrishna_gurram
 */
public class DeTokenizerUtil {

    /**
     * Joins the given tokens back into a single sentence.
     *
     * @param tokens
     *            tokens to merge
     * @param operation
     *            detokenization rule applied uniformly to every token; any of
     *            the {@link DetokenizationDictionary.Operation} constants
     *            (MOVE_BOTH, MOVE_LEFT, MOVE_RIGHT, RIGHT_LEFT_MATCHING)
     * @return the reconstructed sentence
     */
    public static String deTokenize(String[] tokens,
            DetokenizationDictionary.Operation operation) {
        // Every token gets the same operation.
        Operation[] perTokenOps = new Operation[tokens.length];
        int idx = 0;
        while (idx < perTokenOps.length) {
            perTokenOps[idx] = operation;
            idx++;
        }
        DetokenizationDictionary rules =
                new DetokenizationDictionary(tokens, perTokenOps);
        DictionaryDetokenizer joiner = new DictionaryDetokenizer(rules);
        return joiner.detokenize(tokens, " ");
    }
}
import static java.nio.file.Files.readAllBytes;
import static java.nio.file.Paths.get;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Objects;

/**
 * Simple utility class to convert file data to a string.
 *
 * @author harikrishna_gurram
 */
public class FileUtils {

    /**
     * Reads the whole file and returns its contents as a UTF-8 string.
     *
     * @param fileName path of the file to read; must not be null
     * @return file contents, or {@code null} if the file cannot be read
     *         (preserves the original best-effort contract callers rely on)
     */
    public static String getFileDataAsString(String fileName) {
        // Bug fix: Objects.nonNull() only returns a boolean and validates
        // nothing; requireNonNull() actually rejects null arguments.
        Objects.requireNonNull(fileName, "fileName");
        try {
            // Explicit UTF-8: new String(bytes) used the platform-default
            // charset, while the rest of these utilities assume UTF-8.
            return new String(readAllBytes(get(fileName)),
                    StandardCharsets.UTF_8);
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return null;
        }
    }
}
import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.charset.Charset; import opennlp.tools.namefind.NameFinderME; import opennlp.tools.namefind.NameSample; import opennlp.tools.namefind.NameSampleDataStream; import opennlp.tools.namefind.TokenNameFinderFactory; import opennlp.tools.namefind.TokenNameFinderModel; import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; /** * Utility class to train model for name finding * * @author harikrishna_gurram */ public class NameFinderTrainUtil { /** * @param inputFile * : contains training data * @param modelFile * : Model file to save trained information * @throws IOException */ public static void trainModel(String inputFile, String modelFile) throws IOException { Charset charset = Charset.forName("UTF-8"); MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory( new File(inputFile)); ObjectStream<String> lineStream = new PlainTextByLineStream(factory, charset); ObjectStream<NameSample> sampleStream = new NameSampleDataStream( lineStream); TokenNameFinderModel model; try { model = NameFinderME.train("en", "person", sampleStream, TrainingParameters.defaultParams(), new TokenNameFinderFactory()); } finally { sampleStream.close(); } OutputStream modelOut = null; try { modelOut = new BufferedOutputStream(new FileOutputStream(modelFile)); model.serialize(modelOut); } finally { if (modelOut != null) modelOut.close(); } } }
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;

/**
 * Utility class to identify names.
 *
 * @author harikrishna_gurram
 */
public class NameFinderUtil {

    /**
     * Gets all the names from the given input file.
     *
     * @param tokenizerModel model file used to tokenize the file contents
     * @param nameModel      model file used to identify names
     * @param inputFile      input file containing the text to scan
     * @return array of names found, or {@code null} if the name model cannot
     *         be loaded (preserves the original best-effort contract)
     * @throws FileNotFoundException if the name model file does not exist
     */
    public static String[] getNames(String tokenizerModel, String nameModel,
            String inputFile) throws FileNotFoundException {
        TokenNameFinderModel model = loadModel(nameModel);
        if (model == null) {
            return null;
        }
        NameFinderME nameFinder = new NameFinderME(model);
        TokenizerUtil tokenizerUtil = new TokenizerUtil(tokenizerModel);
        String[] tokens = tokenizerUtil.tokenizeUsingLearnableTokenizer(
                FileUtils.getFileDataAsString(inputFile));

        Span[] nameSpans = nameFinder.find(tokens);
        List<String> names = new ArrayList<>();
        for (Span span : nameSpans) {
            // Bug fix: the original concatenated multi-token names without a
            // separator, producing e.g. "JohnSmith" instead of "John Smith".
            StringBuilder name = new StringBuilder();
            for (int i = span.getStart(); i < span.getEnd(); i++) {
                if (name.length() > 0) {
                    name.append(' ');
                }
                name.append(tokens[i]);
            }
            names.add(name.toString());
        }
        return names.toArray(new String[0]);
    }

    /** Loads the name-finder model, returning null on read failure. */
    private static TokenNameFinderModel loadModel(String nameModel)
            throws FileNotFoundException {
        InputStream modelIn = new FileInputStream(nameModel);
        try {
            return new TokenNameFinderModel(modelIn);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            try {
                modelIn.close();
            } catch (IOException e) {
                // Best-effort close; the model (or null) is already decided.
            }
        }
    }
}
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;

import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;

/**
 * Utility class to identify parts of speech such as noun, verb, adjective,
 * etc. using a trained POS model.
 *
 * @author harikrishna_gurram
 */
public class POSTaggerUtil {

    POSModel posModel = null;
    String modelFile = null;
    POSTaggerME tagger = null;
    TokenizerUtil tokenizerUtil = null;

    /**
     * @param posModelFile       Parts-Of-Speech model file name
     * @param tokenizerModelFile tokenizer model file name, used to tokenize
     *                           the contents of a sentence/file
     */
    public POSTaggerUtil(String posModelFile, String tokenizerModelFile) {
        // Bug fix: Objects.nonNull() performs no validation; requireNonNull()
        // actually rejects null arguments.
        Objects.requireNonNull(posModelFile, "posModelFile");
        Objects.requireNonNull(tokenizerModelFile, "tokenizerModelFile");
        modelFile = posModelFile;
        initModel();
        tagger = new POSTaggerME(posModel);
        tokenizerUtil = new TokenizerUtil(tokenizerModelFile);
    }

    /**
     * Loads the POS model; on failure the error is reported and posModel
     * stays null (preserves the original best-effort behavior).
     */
    private void initModel() {
        // try-with-resources replaces the manual close with its empty
        // swallow-all catch block.
        try (InputStream modelIn = new FileInputStream(modelFile)) {
            posModel = new POSModel(modelIn);
        } catch (IOException e) {
            System.out.println(e.getMessage());
        }
    }

    /**
     * Gets POS tags for the given sentence (whitespace-tokenized).
     *
     * @param sentence sentence to tag
     * @return one tag per token
     */
    public String[] getTags(String sentence) {
        String[] tokens =
                tokenizerUtil.tokenizeUsingWhiteSpaceTokenizer(sentence);
        return tagger.tag(tokens);
    }

    /**
     * Gets POS tags for the contents of a file.
     *
     * @param fileName file whose contents should be tagged
     * @return one tag per token
     */
    public String[] getTagsForFile(String fileName) {
        Objects.requireNonNull(fileName, "fileName");
        String data = FileUtils.getFileDataAsString(fileName);
        return getTags(data);
    }
}
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;

/**
 * Utility class to detect sentences using a trained sentence model.
 *
 * @author harikrishna_gurram
 */
public class SentenceDetectorUtil {

    private SentenceModel model = null;
    SentenceDetectorME sentenceDetector = null;

    /**
     * @param modelFile model file used to detect sentences; must not be null
     */
    public SentenceDetectorUtil(String modelFile) {
        // Bug fix: Objects.nonNull() performs no validation; requireNonNull()
        // actually rejects null arguments.
        Objects.requireNonNull(modelFile, "modelFile");
        initSentenceModel(modelFile);
        initSentenceDetectorME();
    }

    private void initSentenceDetectorME() {
        sentenceDetector = new SentenceDetectorME(model);
    }

    /**
     * Loads the sentence model; on failure the error is reported and the
     * model stays null (preserves the original best-effort behavior).
     */
    private SentenceModel initSentenceModel(String file) {
        // try-with-resources replaces the manual close with its empty
        // swallow-all catch block.
        try (InputStream modelIn = new FileInputStream(file)) {
            model = new SentenceModel(modelIn);
        } catch (IOException e) {
            System.out.println(e.getMessage());
        }
        return model;
    }

    /**
     * Returns all sentences found in the given file.
     *
     * @param inputFile file containing text
     * @return detected sentences
     */
    public String[] getSentencesFromFile(String inputFile) {
        String data = FileUtils.getFileDataAsString(inputFile);
        return sentenceDetector.sentDetect(data);
    }

    /**
     * Returns all sentences found in the given text.
     *
     * @param data text to split into sentences
     * @return detected sentences
     */
    public String[] getSentences(String data) {
        return sentenceDetector.sentDetect(data);
    }
}
import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.charset.Charset; import java.util.Objects; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.sentdetect.SentenceDetectorFactory; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.sentdetect.SentenceSample; import opennlp.tools.sentdetect.SentenceSampleStream; import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; /** * Utility class to train model for detecting sentences. * * @author harikrishna_gurram */ public class SentenceTrainer { /** * @param inputFile * contains training data * @param modelFile * Generated model file after training * @throws IOException */ public static void trainModel(String inputFile, String modelFile) throws IOException { Objects.nonNull(inputFile); Objects.nonNull(modelFile); MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory( new File(inputFile)); Charset charset = Charset.forName("UTF-8"); ObjectStream<String> lineStream = new PlainTextByLineStream(factory, charset); ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream( lineStream); SentenceModel model; try { char[] eosCharacters = { '.', ';' }; SentenceDetectorFactory sentenceDetectorFactory = new SentenceDetectorFactory( "en", true, new Dictionary(), eosCharacters); model = SentenceDetectorME .train("en", sampleStream, sentenceDetectorFactory, TrainingParameters.defaultParams()); } finally { sampleStream.close(); } OutputStream modelOut = null; try { modelOut = new BufferedOutputStream(new FileOutputStream(modelFile)); model.serialize(modelOut); } finally { if (modelOut != null) modelOut.close(); } } }
import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.charset.Charset; import java.util.Objects; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.tokenize.TokenSample; import opennlp.tools.tokenize.TokenSampleStream; import opennlp.tools.tokenize.TokenizerFactory; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; /** * Utility class to train model to identify tokens. * * @author harikrishna_gurram */ public class TokenizerTrainer { /** * @param inputFile * contains training data * @param modelFile * Generated model file after training * @throws IOException */ public static void trainModel(String inputFile, String modelFile) throws IOException { Objects.nonNull(inputFile); Objects.nonNull(modelFile); Charset charset = Charset.forName("UTF-8"); MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory( new File(inputFile)); ObjectStream<String> lineStream = new PlainTextByLineStream(factory, charset); ObjectStream<TokenSample> sampleStream = new TokenSampleStream( lineStream); TokenizerModel model; try { TokenizerFactory tokenizerFactory = new TokenizerFactory("en", new Dictionary(), false, null); model = TokenizerME.train(sampleStream, tokenizerFactory, TrainingParameters.defaultParams()); } finally { sampleStream.close(); } OutputStream modelOut = null; try { modelOut = new BufferedOutputStream(new FileOutputStream(modelFile)); model.serialize(modelOut); } finally { if (modelOut != null) modelOut.close(); } } }
import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.Objects; import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.tokenize.WhitespaceTokenizer; /** * Utility class to identify tokens * * @author harikrishna_gurram */ public class TokenizerUtil { TokenizerModel model = null; Tokenizer learnableTokenizer = null; public TokenizerUtil(String modelFile) { initTokenizerModel(modelFile); learnableTokenizer = new TokenizerME(model); } private void initTokenizerModel(String modelFile) { Objects.nonNull(modelFile); InputStream modelIn = null; try { modelIn = new FileInputStream(modelFile); } catch (FileNotFoundException e) { System.out.println(e.getMessage()); return; } try { model = new TokenizerModel(modelIn); } catch (IOException e) { e.printStackTrace(); } finally { if (modelIn != null) { try { modelIn.close(); } catch (IOException e) { } } } } public Tokenizer getLearnableTokenizer() { return learnableTokenizer; } public Tokenizer getWhitespaceTokenizer() { return WhitespaceTokenizer.INSTANCE; } public String[] tokenizeFileUsingLearnableTokenizer(String file) { String data = FileUtils.getFileDataAsString(file); return learnableTokenizer.tokenize(data); } public String[] tokenizeUsingLearnableTokenizer(String data) { return learnableTokenizer.tokenize(data); } public String[] tokenizeFileUsingWhiteSpaceTokenizer(String file) { String data = FileUtils.getFileDataAsString(file); return getWhitespaceTokenizer().tokenize(data); } public String[] tokenizeUsingWhiteSpaceTokenizer(String data) { return getWhitespaceTokenizer().tokenize(data); } }
No comments:
Post a Comment