In a previous post, I explained how to detect sentences. In this post, I am going to explain
how to train a model to detect sentences using OpenNLP.
Approach 1: Using the command line interface.
opennlp
SentenceDetectorTrainer -model en-sent_trained.bin -lang en -data en-sent.train
-encoding UTF-8
The above
command generates the training model.
en-sent_trained.bin : Trained binary file.
en-sent.train : Sample training data file. The only requirement is
that each sentence appears on a separate line in the training file, like below.
Sentence 1
Sentence 2
Sentence 3
……
$ opennlp SentenceDetectorTrainer -model en-sent_trained.bin -lang en -data /Users/harikrishna_gurram/Desktop/training_data.txt -encoding UTF-8
Indexing events using cutoff of 5
Computing event counts... done. 173 events
Indexing... done.
Sorting and merging events... done. Reduced 173 events to 70.
Done indexing.
Incorporating indexed data for training...
done.
Number of Event Tokens: 70
Number of Outcomes: 2
Number of Predicates: 24
...done.
Computing model parameters ...
Performing 100 iterations.
1: ... loglikelihood=-119.91446223687062 0.9248554913294798
2: ... loglikelihood=-55.76980343694648 0.9248554913294798
3: ... loglikelihood=-49.1158495064482 0.9248554913294798
4: ... loglikelihood=-46.1065134982585 0.9248554913294798
5: ... loglikelihood=-43.844742929380935 0.9248554913294798
6: ... loglikelihood=-42.01639295714996 0.9248554913294798
7: ... loglikelihood=-40.51989933789465 0.9248554913294798
8: ... loglikelihood=-39.28113766380895 0.9248554913294798
9: ... loglikelihood=-38.24249324859753 0.9248554913294798
10: ... loglikelihood=-37.36057591536305 0.9248554913294798
11: ... loglikelihood=-36.60298843071962 0.9248554913294798
12: ... loglikelihood=-35.945381665934306 0.9248554913294798
13: ... loglikelihood=-35.36923040272217 0.9248554913294798
14: ... loglikelihood=-34.86024706593071 0.9248554913294798
15: ... loglikelihood=-34.407262894045616 0.9248554913294798
16: ... loglikelihood=-34.00143477708754 0.9248554913294798
17: ... loglikelihood=-33.63567698442831 0.9248554913294798
18: ... loglikelihood=-33.30424897078376 0.9248554913294798
19: ... loglikelihood=-33.00245256724807 0.9248554913294798
20: ... loglikelihood=-32.72640671310274 0.9248554913294798
21: ... loglikelihood=-32.47287781515418 0.9248554913294798
22: ... loglikelihood=-32.23915049691368 0.930635838150289
23: ... loglikelihood=-32.022928024091726 0.930635838150289
24: ... loglikelihood=-31.822254788124276 0.930635838150289
25: ... loglikelihood=-31.635455368588524 0.930635838150289
26: ... loglikelihood=-31.46108618931423 0.9364161849710982
27: ... loglikelihood=-31.29789683758349 0.9364161849710982
28: ... loglikelihood=-31.144798868314474 0.9364161849710982
29: ... loglikelihood=-31.000840457853734 0.9364161849710982
30: ... loglikelihood=-30.865185667562713 0.9364161849710982
31: ... loglikelihood=-30.737097368631694 0.9364161849710982
32: ... loglikelihood=-30.6159230961007 0.9364161849710982
33: ... loglikelihood=-30.50108326258004 0.9364161849710982
34: ... loglikelihood=-30.3920612852033 0.9364161849710982
35: ... loglikelihood=-30.288395273285126 0.9364161849710982
36: ... loglikelihood=-30.189670996439865 0.9364161849710982
37: ... loglikelihood=-30.095515908954855 0.9364161849710982
38: ... loglikelihood=-30.00559404995929 0.9364161849710982
39: ... loglikelihood=-29.919601673307763 0.9364161849710982
40: ... loglikelihood=-29.837263488284236 0.9421965317919075
41: ... loglikelihood=-29.758329413859432 0.9421965317919075
42: ... loglikelihood=-29.682571766535876 0.9421965317919075
43: ... loglikelihood=-29.609782815731158 0.9421965317919075
44: ... loglikelihood=-29.539772651900236 0.9421965317919075
45: ... loglikelihood=-29.472367321737572 0.9421965317919075
46: ... loglikelihood=-29.407407192260123 0.9421965317919075
47: ... loglikelihood=-29.344745511688778 0.9421965317919075
48: ... loglikelihood=-29.28424714008203 0.9421965317919075
49: ... loglikelihood=-29.22578742684012 0.9421965317919075
50: ... loglikelihood=-29.1692512156541 0.9421965317919075
51: ... loglikelihood=-29.114531960354388 0.9421965317919075
52: ... loglikelihood=-29.0615309375221 0.9421965317919075
53: ... loglikelihood=-29.01015654374704 0.9421965317919075
54: ... loglikelihood=-28.960323667118907 0.9421965317919075
55: ... loglikelihood=-28.91195312397489 0.9421965317919075
56: ... loglikelihood=-28.86497115314632 0.9421965317919075
57: ... loglikelihood=-28.819308960981054 0.9421965317919075
58: ... loglikelihood=-28.774902311302117 0.9421965317919075
59: ... loglikelihood=-28.73169115521662 0.9421965317919075
60: ... loglikelihood=-28.689619296336176 0.9421965317919075
61: ... loglikelihood=-28.64863408752512 0.9421965317919075
62: ... loglikelihood=-28.608686155771913 0.9421965317919075
63: ... loglikelihood=-28.569729152192306 0.9421965317919075
64: ... loglikelihood=-28.531719524530374 0.9421965317919075
65: ... loglikelihood=-28.494616309834303 0.9421965317919075
66: ... loglikelihood=-28.458380945253495 0.9421965317919075
67: ... loglikelihood=-28.422977095138542 0.9421965317919075
68: ... loglikelihood=-28.388370492831363 0.9421965317919075
69: ... loglikelihood=-28.354528795711865 0.9421965317919075
70: ... loglikelihood=-28.321421452225398 0.9421965317919075
71: ... loglikelihood=-28.289019579753322 0.9421965317919075
72: ... loglikelihood=-28.257295852310712 0.9421965317919075
73: ... loglikelihood=-28.22622439716231 0.9421965317919075
74: ... loglikelihood=-28.195780699542937 0.9421965317919075
75: ... loglikelihood=-28.165941514751758 0.9421965317919075
76: ... loglikelihood=-28.13668478696431 0.9421965317919075
77: ... loglikelihood=-28.107989574172024 0.9421965317919075
78: ... loglikelihood=-28.07983597871727 0.9421965317919075
79: ... loglikelihood=-28.052205082944273 0.9421965317919075
80: ... loglikelihood=-28.02507888953219 0.9421965317919075
81: ... loglikelihood=-27.998440266118852 0.9421965317919075
82: ... loglikelihood=-27.97227289385961 0.9421965317919075
83: ... loglikelihood=-27.946561219600014 0.9421965317919075
84: ... loglikelihood=-27.921290411369746 0.9421965317919075
85: ... loglikelihood=-27.896446316932657 0.9421965317919075
86: ... loglikelihood=-27.87201542515115 0.9421965317919075
87: ... loglikelihood=-27.847984829945073 0.9421965317919075
88: ... loglikelihood=-27.82434219664453 0.9421965317919075
89: ... loglikelihood=-27.801075730553457 0.9421965317919075
90: ... loglikelihood=-27.77817414755689 0.9421965317919075
91: ... loglikelihood=-27.755626646618794 0.9421965317919075
92: ... loglikelihood=-27.733422884030574 0.9421965317919075
93: ... loglikelihood=-27.71155294928182 0.9479768786127167
94: ... loglikelihood=-27.690007342435727 0.9479768786127167
95: ... loglikelihood=-27.66877695290108 0.9479768786127167
96: ... loglikelihood=-27.6478530395015 0.9479768786127167
97: ... loglikelihood=-27.627227211750785 0.9479768786127167
98: ... loglikelihood=-27.606891412250185 0.9479768786127167
99: ... loglikelihood=-27.58683790013019 0.9479768786127167
100: ... loglikelihood=-27.567059235465614 0.9479768786127167
Writing sentence detector model ... done (0.014s)
Wrote sentence detector model to
path: /Users/harikrishna_gurram/study1/OpenNLP/apache-opennlp-1.6.0/bin/models/en-sent_trained.bin
Approach 2: Using the Java API. Following is a complete, working
Java application.
import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.charset.Charset; import java.util.Objects; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.sentdetect.SentenceDetectorFactory; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.sentdetect.SentenceSample; import opennlp.tools.sentdetect.SentenceSampleStream; import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; public class SentenceTrainer { /** * @param inputFile * contains training data * @param modelFile * Generated model file after training * @throws IOException */ public static void trainModel(String inputFile, String modelFile) throws IOException { Objects.nonNull(inputFile); Objects.nonNull(modelFile); MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory( new File(inputFile)); Charset charset = Charset.forName("UTF-8"); ObjectStream<String> lineStream = new PlainTextByLineStream(factory, charset); ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream( lineStream); SentenceModel model; try { char[] eosCharacters = { '.', ';' }; SentenceDetectorFactory sentenceDetectorFactory = new SentenceDetectorFactory( "en", true, new Dictionary(), eosCharacters); model = SentenceDetectorME.train("en", sampleStream, sentenceDetectorFactory, TrainingParameters.defaultParams()); } finally { sampleStream.close(); } OutputStream modelOut = null; try { modelOut = new BufferedOutputStream(new FileOutputStream(modelFile)); model.serialize(modelOut); } finally { if (modelOut != null) modelOut.close(); } } }
import java.io.IOException;

/**
 * Entry point: trains a sentence-detector model from a sample data file and
 * writes the resulting model binary next to it.
 */
public class Main {

    public static void main(String[] args) throws IOException {
        final String inputFile = "/Users/harikrishna_gurram/Desktop/data.txt";
        final String modelFile = "/Users/harikrishna_gurram/Desktop/model.bin";
        SentenceTrainer.trainModel(inputFile, modelFile);
    }
}
No comments:
Post a Comment