In a previous post, I explained how to detect sentences. In this post, I am going to explain
how to train a model to detect sentences using OpenNLP.
Approach 1: Using command line interface.
opennlp
SentenceDetectorTrainer -model en-sent_trained.bin -lang en -data en-sent.train
-encoding UTF-8
The above
command is used to generate the trained model.
en-sent_trained.bin : Trained binary file.
en-sent.train : Sample training data file. The only requirement is
that each sentence should be on a separate line in the training file like below.
Sentence 1
Sentence 2
Sentence 3
……
……
$ opennlp SentenceDetectorTrainer -model en-sent_trained.bin -lang en -data /Users/harikrishna_gurram/Desktop/training_data.txt -encoding UTF-8 Indexing events using cutoff of 5 Computing event counts... done. 173 events Indexing... done. Sorting and merging events... done. Reduced 173 events to 70. Done indexing. Incorporating indexed data for training... done. Number of Event Tokens: 70 Number of Outcomes: 2 Number of Predicates: 24 ...done. Computing model parameters ... Performing 100 iterations. 1: ... loglikelihood=-119.91446223687062 0.9248554913294798 2: ... loglikelihood=-55.76980343694648 0.9248554913294798 3: ... loglikelihood=-49.1158495064482 0.9248554913294798 4: ... loglikelihood=-46.1065134982585 0.9248554913294798 5: ... loglikelihood=-43.844742929380935 0.9248554913294798 6: ... loglikelihood=-42.01639295714996 0.9248554913294798 7: ... loglikelihood=-40.51989933789465 0.9248554913294798 8: ... loglikelihood=-39.28113766380895 0.9248554913294798 9: ... loglikelihood=-38.24249324859753 0.9248554913294798 10: ... loglikelihood=-37.36057591536305 0.9248554913294798 11: ... loglikelihood=-36.60298843071962 0.9248554913294798 12: ... loglikelihood=-35.945381665934306 0.9248554913294798 13: ... loglikelihood=-35.36923040272217 0.9248554913294798 14: ... loglikelihood=-34.86024706593071 0.9248554913294798 15: ... loglikelihood=-34.407262894045616 0.9248554913294798 16: ... loglikelihood=-34.00143477708754 0.9248554913294798 17: ... loglikelihood=-33.63567698442831 0.9248554913294798 18: ... loglikelihood=-33.30424897078376 0.9248554913294798 19: ... loglikelihood=-33.00245256724807 0.9248554913294798 20: ... loglikelihood=-32.72640671310274 0.9248554913294798 21: ... loglikelihood=-32.47287781515418 0.9248554913294798 22: ... loglikelihood=-32.23915049691368 0.930635838150289 23: ... loglikelihood=-32.022928024091726 0.930635838150289 24: ... loglikelihood=-31.822254788124276 0.930635838150289 25: ... 
loglikelihood=-31.635455368588524 0.930635838150289 26: ... loglikelihood=-31.46108618931423 0.9364161849710982 27: ... loglikelihood=-31.29789683758349 0.9364161849710982 28: ... loglikelihood=-31.144798868314474 0.9364161849710982 29: ... loglikelihood=-31.000840457853734 0.9364161849710982 30: ... loglikelihood=-30.865185667562713 0.9364161849710982 31: ... loglikelihood=-30.737097368631694 0.9364161849710982 32: ... loglikelihood=-30.6159230961007 0.9364161849710982 33: ... loglikelihood=-30.50108326258004 0.9364161849710982 34: ... loglikelihood=-30.3920612852033 0.9364161849710982 35: ... loglikelihood=-30.288395273285126 0.9364161849710982 36: ... loglikelihood=-30.189670996439865 0.9364161849710982 37: ... loglikelihood=-30.095515908954855 0.9364161849710982 38: ... loglikelihood=-30.00559404995929 0.9364161849710982 39: ... loglikelihood=-29.919601673307763 0.9364161849710982 40: ... loglikelihood=-29.837263488284236 0.9421965317919075 41: ... loglikelihood=-29.758329413859432 0.9421965317919075 42: ... loglikelihood=-29.682571766535876 0.9421965317919075 43: ... loglikelihood=-29.609782815731158 0.9421965317919075 44: ... loglikelihood=-29.539772651900236 0.9421965317919075 45: ... loglikelihood=-29.472367321737572 0.9421965317919075 46: ... loglikelihood=-29.407407192260123 0.9421965317919075 47: ... loglikelihood=-29.344745511688778 0.9421965317919075 48: ... loglikelihood=-29.28424714008203 0.9421965317919075 49: ... loglikelihood=-29.22578742684012 0.9421965317919075 50: ... loglikelihood=-29.1692512156541 0.9421965317919075 51: ... loglikelihood=-29.114531960354388 0.9421965317919075 52: ... loglikelihood=-29.0615309375221 0.9421965317919075 53: ... loglikelihood=-29.01015654374704 0.9421965317919075 54: ... loglikelihood=-28.960323667118907 0.9421965317919075 55: ... loglikelihood=-28.91195312397489 0.9421965317919075 56: ... loglikelihood=-28.86497115314632 0.9421965317919075 57: ... loglikelihood=-28.819308960981054 0.9421965317919075 58: ... 
loglikelihood=-28.774902311302117 0.9421965317919075 59: ... loglikelihood=-28.73169115521662 0.9421965317919075 60: ... loglikelihood=-28.689619296336176 0.9421965317919075 61: ... loglikelihood=-28.64863408752512 0.9421965317919075 62: ... loglikelihood=-28.608686155771913 0.9421965317919075 63: ... loglikelihood=-28.569729152192306 0.9421965317919075 64: ... loglikelihood=-28.531719524530374 0.9421965317919075 65: ... loglikelihood=-28.494616309834303 0.9421965317919075 66: ... loglikelihood=-28.458380945253495 0.9421965317919075 67: ... loglikelihood=-28.422977095138542 0.9421965317919075 68: ... loglikelihood=-28.388370492831363 0.9421965317919075 69: ... loglikelihood=-28.354528795711865 0.9421965317919075 70: ... loglikelihood=-28.321421452225398 0.9421965317919075 71: ... loglikelihood=-28.289019579753322 0.9421965317919075 72: ... loglikelihood=-28.257295852310712 0.9421965317919075 73: ... loglikelihood=-28.22622439716231 0.9421965317919075 74: ... loglikelihood=-28.195780699542937 0.9421965317919075 75: ... loglikelihood=-28.165941514751758 0.9421965317919075 76: ... loglikelihood=-28.13668478696431 0.9421965317919075 77: ... loglikelihood=-28.107989574172024 0.9421965317919075 78: ... loglikelihood=-28.07983597871727 0.9421965317919075 79: ... loglikelihood=-28.052205082944273 0.9421965317919075 80: ... loglikelihood=-28.02507888953219 0.9421965317919075 81: ... loglikelihood=-27.998440266118852 0.9421965317919075 82: ... loglikelihood=-27.97227289385961 0.9421965317919075 83: ... loglikelihood=-27.946561219600014 0.9421965317919075 84: ... loglikelihood=-27.921290411369746 0.9421965317919075 85: ... loglikelihood=-27.896446316932657 0.9421965317919075 86: ... loglikelihood=-27.87201542515115 0.9421965317919075 87: ... loglikelihood=-27.847984829945073 0.9421965317919075 88: ... loglikelihood=-27.82434219664453 0.9421965317919075 89: ... loglikelihood=-27.801075730553457 0.9421965317919075 90: ... 
loglikelihood=-27.77817414755689 0.9421965317919075 91: ... loglikelihood=-27.755626646618794 0.9421965317919075 92: ... loglikelihood=-27.733422884030574 0.9421965317919075 93: ... loglikelihood=-27.71155294928182 0.9479768786127167 94: ... loglikelihood=-27.690007342435727 0.9479768786127167 95: ... loglikelihood=-27.66877695290108 0.9479768786127167 96: ... loglikelihood=-27.6478530395015 0.9479768786127167 97: ... loglikelihood=-27.627227211750785 0.9479768786127167 98: ... loglikelihood=-27.606891412250185 0.9479768786127167 99: ... loglikelihood=-27.58683790013019 0.9479768786127167 100: ... loglikelihood=-27.567059235465614 0.9479768786127167 Writing sentence detector model ... done (0.014s) Wrote sentence detector model to path: /Users/harikrishna_gurram/study1/OpenNLP/apache-opennlp-1.6.0/bin/models/en-sent_trained.bin
Approach 2: Using the Java API. The following is a complete, working
Java application.
import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.charset.Charset; import java.util.Objects; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.sentdetect.SentenceDetectorFactory; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.sentdetect.SentenceSample; import opennlp.tools.sentdetect.SentenceSampleStream; import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; public class SentenceTrainer { /** * @param inputFile * contains training data * @param modelFile * Generated model file after training * @throws IOException */ public static void trainModel(String inputFile, String modelFile) throws IOException { Objects.nonNull(inputFile); Objects.nonNull(modelFile); MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory( new File(inputFile)); Charset charset = Charset.forName("UTF-8"); ObjectStream<String> lineStream = new PlainTextByLineStream(factory, charset); ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream( lineStream); SentenceModel model; try { char[] eosCharacters = { '.', ';' }; SentenceDetectorFactory sentenceDetectorFactory = new SentenceDetectorFactory( "en", true, new Dictionary(), eosCharacters); model = SentenceDetectorME.train("en", sampleStream, sentenceDetectorFactory, TrainingParameters.defaultParams()); } finally { sampleStream.close(); } OutputStream modelOut = null; try { modelOut = new BufferedOutputStream(new FileOutputStream(modelFile)); model.serialize(modelOut); } finally { if (modelOut != null) modelOut.close(); } } }
import java.io.IOException;

/**
 * Entry point: trains a sentence-detection model from the sample data file
 * and writes the resulting model to disk.
 */
public class Main {

    public static void main(String[] args) throws IOException {
        final String inputFile = "/Users/harikrishna_gurram/Desktop/data.txt";
        final String modelFile = "/Users/harikrishna_gurram/Desktop/model.bin";
        SentenceTrainer.trainModel(inputFile, modelFile);
    }
}
No comments:
Post a Comment