By using the
Name Finder, you can identify names and numbers in text.
Name Finding using CLI (Command line interface)
Use the ‘TokenNameFinder’
tool to identify names.
$ opennlp TokenNameFinder Usage: opennlp TokenNameFinder model1 model2 ... modelN < sentences
First,
download the ‘en-ner-person.bin’ file (the person name finder model) from the following
location.
Run the
command ‘opennlp TokenNameFinder en-ner-person.bin <input.txt’.
Let's say ‘input.txt’
contains the following data.
Pierre , 27
years old, is a software Engineer joined xyz organisation.
Mr . Vinken is the project manager, team size 10.
Mr . Vinken is the project manager, team size 10.
$ opennlp TokenNameFinder ./en-ner-person.bin <input.txt Loading Token Name Finder model ... done (0.673s) <START:person> Pierre <END> , 27 years old, is a software Engineer joined xyz organization. Mr . <START:person> Vinken <END> is the project manager, team size 10. Average: 272.7 sent/s Total: 3 sent Runtime: 0.011s
As you can
observe, TokenNameFinder outputs the text with markup around person names.
Name Finding using Java API
import static java.nio.file.Files.readAllBytes;
import static java.nio.file.Paths.get;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Objects;

/**
 * Small helper for reading a whole file into memory as text.
 */
public class FileUtils {

    /**
     * Reads the entire file and returns its content as a string.
     *
     * <p>The bytes are decoded as UTF-8 so the result does not depend on the
     * platform default charset.
     *
     * @param fileName path of the file to read; must not be {@code null}
     * @return the file content, or {@code null} if the file could not be read
     *         (the exception message is printed to standard output in that case)
     * @throws NullPointerException if {@code fileName} is {@code null}
     */
    public static String getFileDataAsString(String fileName) {
        // requireNonNull actually rejects null; Objects.nonNull only returns a boolean.
        Objects.requireNonNull(fileName, "fileName");

        try {
            return new String(readAllBytes(get(fileName)), StandardCharsets.UTF_8);
        } catch (IOException e) {
            // Best-effort contract kept from the original: report and signal failure with null.
            System.out.println(e.getMessage());
            return null;
        }
    }
}
import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.Objects; import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.tokenize.WhitespaceTokenizer; public class TokenizerUtil { TokenizerModel model = null; Tokenizer learnableTokenizer = null; public TokenizerUtil(String modelFile) { initTokenizerModel(modelFile); learnableTokenizer = new TokenizerME(model); } private void initTokenizerModel(String modelFile) { Objects.nonNull(modelFile); InputStream modelIn = null; try { modelIn = new FileInputStream(modelFile); } catch (FileNotFoundException e) { System.out.println(e.getMessage()); return; } try { model = new TokenizerModel(modelIn); } catch (IOException e) { e.printStackTrace(); } finally { if (modelIn != null) { try { modelIn.close(); } catch (IOException e) { } } } } public Tokenizer getLearnableTokenizer() { return learnableTokenizer; } public Tokenizer getWhitespaceTokenizer() { return WhitespaceTokenizer.INSTANCE; } public String[] tokenizeFileUsingLearnableTokenizer(String file) { String data = FileUtils.getFileDataAsString(file); return learnableTokenizer.tokenize(data); } public String[] tokenizeUsingLearnableTokenizer(String data) { return learnableTokenizer.tokenize(data); } public String[] tokenizeFileUsingWhiteSpaceTokenizer(String file) { String data = FileUtils.getFileDataAsString(file); return getWhitespaceTokenizer().tokenize(data); } public String[] tokenizeUsingWhiteSpaceTokenizer(String data) { return getWhitespaceTokenizer().tokenize(data); } }
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;

/**
 * Extracts names from a text file using an OpenNLP name finder model.
 */
public class NameFinderUtil {

    /**
     * Tokenizes the content of {@code inputFile} and returns every name the
     * name finder model detects in it.
     *
     * <p>Each returned name is built from the tokens covered by one detected
     * span, joined with a single space, so a multi-token name keeps its word
     * boundaries.
     *
     * @param tokenizerModel path of the tokenizer model file
     * @param nameModel      path of the name finder model file
     * @param inputFile      path of the text file to analyse
     * @return the detected names in span order, or {@code null} if the name
     *         finder model or the input file could not be read
     * @throws FileNotFoundException if {@code nameModel} cannot be opened
     */
    public static String[] getNames(String tokenizerModel, String nameModel,
            String inputFile) throws FileNotFoundException {
        TokenNameFinderModel model = loadNameFinderModel(nameModel);
        if (model == null) {
            return null;
        }

        NameFinderME nameFinder = new NameFinderME(model);
        TokenizerUtil tokenizerUtil = new TokenizerUtil(tokenizerModel);

        String text = FileUtils.getFileDataAsString(inputFile);
        if (text == null) {
            // The file reader signals an unreadable input with null;
            // signal failure the same way as a model load error.
            return null;
        }

        String[] tokens = tokenizerUtil.tokenizeUsingLearnableTokenizer(text);
        Span[] nameSpans = nameFinder.find(tokens);

        List<String> names = new ArrayList<>(nameSpans.length);
        for (Span span : nameSpans) {
            names.add(joinTokens(tokens, span.getStart(), span.getEnd()));
        }
        return names.toArray(new String[names.size()]);
    }

    /**
     * Loads the name finder model; returns null (after printing the stack
     * trace) when the file opens but cannot be read as a model.
     */
    private static TokenNameFinderModel loadNameFinderModel(String nameModel)
            throws FileNotFoundException {
        try (InputStream modelIn = new FileInputStream(nameModel)) {
            return new TokenNameFinderModel(modelIn);
        } catch (FileNotFoundException e) {
            // Rethrown so it is not swallowed by the broader IOException handler below.
            throw e;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Joins tokens[start..end) with single spaces. */
    private static String joinTokens(String[] tokens, int start, int end) {
        StringBuilder joined = new StringBuilder();
        for (int i = start; i < end; i++) {
            if (i > start) {
                joined.append(' ');
            }
            joined.append(tokens[i]);
        }
        return joined.toString();
    }
}
import java.io.IOException;

/**
 * Command-line entry point: prints every name found in an input text file,
 * one per line.
 *
 * <p>Usage: {@code Main [tokenModelFile [nameModelFile [inputFile]]]}. Any
 * argument that is omitted falls back to the built-in default path.
 */
public class Main {
    private static final String DEFAULT_TOKEN_MODEL =
            "/Users/harikrishna_gurram/study1/OpenNLP/apache-opennlp-1.6.0/bin/models/en-token.bin";
    private static final String DEFAULT_NAME_MODEL =
            "/Users/harikrishna_gurram/study1/OpenNLP/apache-opennlp-1.6.0/bin/models/en-ner-person.bin";
    private static final String DEFAULT_INPUT_FILE =
            "/Users/harikrishna_gurram/study1/OpenNLP/apache-opennlp-1.6.0/bin/models/input.txt";

    /**
     * Runs the name finder and prints each detected name on its own line.
     *
     * @param args optional overrides: tokenizer model path, name finder model
     *             path, input file path (in that order)
     * @throws IOException if the name finder model file cannot be opened
     */
    public static void main(String args[]) throws IOException {
        String tokenModelFile = args.length > 0 ? args[0] : DEFAULT_TOKEN_MODEL;
        String modelFile = args.length > 1 ? args[1] : DEFAULT_NAME_MODEL;
        String inputFile = args.length > 2 ? args[2] : DEFAULT_INPUT_FILE;

        String[] result = NameFinderUtil.getNames(tokenModelFile, modelFile, inputFile);
        if (result == null) {
            // getNames returns null when the model cannot be read; avoid iterating over null.
            System.err.println("Name finding failed.");
            return;
        }

        for (String name : result) {
            System.out.println(name);
        }
    }
}
Output
Pierre Vinken
No comments:
Post a Comment