Thursday, 1 October 2015

openNLP: Name Finder


By using the Name Finder, you can identify names and numbers in text.

Name Finding using CLI (Command line interface)
Use the ‘TokenNameFinder’ tool to identify names.

$ opennlp TokenNameFinder
Usage: opennlp TokenNameFinder model1 model2 ... modelN < sentences

First, download the ‘en-ner-person.bin’ file (the person name finder model) from the following location.

Run the command ‘opennlp TokenNameFinder en-ner-person.bin <input.txt’.

Let's say ‘input.txt’ contains the following data.
Pierre , 27 years old, is a software Engineer joined xyz organization.
Mr . Vinken is the project manager, team size 10.

$ opennlp TokenNameFinder ./en-ner-person.bin <input.txt
Loading Token Name Finder model ... done (0.673s)

<START:person> Pierre <END> , 27 years old, is a software Engineer joined xyz organization.
Mr . <START:person> Vinken <END> is the project manager, team size 10.


Average: 272.7 sent/s 
Total: 3 sent
Runtime: 0.011s

As you can see, TokenNameFinder outputs the text with markup around person names.


Name Finding using Java API

import static java.nio.file.Files.readAllBytes;
import static java.nio.file.Paths.get;

import java.io.IOException;
import java.util.Objects;

public class FileUtils {
 /**
  * Reads the whole file into a string, decoding its bytes as UTF-8.
  *
  * @param fileName
  *            path of the file to read; must not be null
  * @return the file content, or {@code null} if the file could not be read
  *         (the I/O error message is printed to standard output)
  * @throws NullPointerException
  *             if {@code fileName} is null
  */
 public static String getFileDataAsString(String fileName) {
  // Objects.nonNull only returns a boolean; requireNonNull actually
  // rejects a null argument.
  Objects.requireNonNull(fileName, "fileName");
  try {
   // Decode explicitly as UTF-8 rather than with the platform default
   // charset, so the result does not depend on the machine it runs on.
   return new String(readAllBytes(get(fileName)),
     java.nio.charset.StandardCharsets.UTF_8);
  } catch (IOException e) {
   // Best-effort contract kept for existing callers: report and
   // return null instead of propagating the exception.
   System.out.println(e.getMessage());
   return null;
  }
 }
}


import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;

import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;

/**
 * Convenience wrapper around OpenNLP tokenizers: a learnable tokenizer backed
 * by a model file, and the shared whitespace tokenizer.
 */
public class TokenizerUtil {
 TokenizerModel model;
 Tokenizer learnableTokenizer;

 /**
  * Loads the tokenizer model and builds the learnable tokenizer from it.
  *
  * @param modelFile
  *            path of the tokenizer model file; must not be null
  * @throws NullPointerException
  *             if {@code modelFile} is null
  * @throws IllegalStateException
  *             if the model file cannot be opened or read
  */
 public TokenizerUtil(String modelFile) {
  model = loadTokenizerModel(modelFile);
  learnableTokenizer = new TokenizerME(model);
 }

 /**
  * Reads the tokenizer model from the given file.
  *
  * Fails with the underlying cause instead of leaving the model null, so
  * a missing or unreadable file is reported where it happens rather than
  * when the null model is handed to the tokenizer.
  */
 private static TokenizerModel loadTokenizerModel(String modelFile) {
  // Objects.nonNull only returns a boolean; requireNonNull rejects null.
  Objects.requireNonNull(modelFile, "modelFile");

  // try-with-resources closes the stream on every path.
  try (InputStream modelIn = new FileInputStream(modelFile)) {
   return new TokenizerModel(modelIn);
  } catch (IOException e) {
   throw new IllegalStateException(
     "Could not load tokenizer model: " + modelFile, e);
  }
 }

 /**
  * Reads the file content, failing explicitly when the file cannot be read
  * (FileUtils signals that by returning null).
  */
 private static String readFile(String file) {
  String data = FileUtils.getFileDataAsString(file);
  if (data == null) {
   throw new IllegalStateException("Could not read file: " + file);
  }
  return data;
 }

 /** @return the tokenizer built from the model file */
 public Tokenizer getLearnableTokenizer() {
  return learnableTokenizer;
 }

 /** @return the shared whitespace tokenizer instance */
 public Tokenizer getWhitespaceTokenizer() {
  return WhitespaceTokenizer.INSTANCE;
 }

 /**
  * Tokenizes the content of a file with the learnable tokenizer.
  *
  * @param file
  *            path of the file to tokenize
  * @return the tokens
  * @throws IllegalStateException
  *             if the file cannot be read
  */
 public String[] tokenizeFileUsingLearnableTokenizer(String file) {
  return learnableTokenizer.tokenize(readFile(file));
 }

 /**
  * Tokenizes the given text with the learnable tokenizer.
  *
  * @param data
  *            text to tokenize
  * @return the tokens
  */
 public String[] tokenizeUsingLearnableTokenizer(String data) {
  return learnableTokenizer.tokenize(data);
 }

 /**
  * Tokenizes the content of a file with the whitespace tokenizer.
  *
  * @param file
  *            path of the file to tokenize
  * @return the tokens
  * @throws IllegalStateException
  *             if the file cannot be read
  */
 public String[] tokenizeFileUsingWhiteSpaceTokenizer(String file) {
  return getWhitespaceTokenizer().tokenize(readFile(file));
 }

 /**
  * Tokenizes the given text with the whitespace tokenizer.
  *
  * @param data
  *            text to tokenize
  * @return the tokens
  */
 public String[] tokenizeUsingWhiteSpaceTokenizer(String data) {
  return getWhitespaceTokenizer().tokenize(data);
 }
}


import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;

/**
 * Extracts names from a text file using an OpenNLP name finder model.
 */
public class NameFinderUtil {

 /**
  * Tokenizes the input file and returns the text of every name span the
  * name finder reports. The tokens of a multi-token name are joined with
  * a single space.
  *
  * @param tokenizerModel
  *            path of the tokenizer model file
  * @param nameModel
  *            path of the name finder model file
  * @param inputFile
  *            path of the text file to analyse
  * @return the names found, in the order the name finder reports them, or
  *         {@code null} if the name finder model could not be read
  * @throws FileNotFoundException
  *             if the name finder model file cannot be opened, or the
  *             input file cannot be read
  */
 public static String[] getNames(String tokenizerModel, String nameModel,
   String inputFile) throws FileNotFoundException {

  TokenNameFinderModel model = loadNameFinderModel(nameModel);
  if (model == null) {
   // Contract kept for existing callers: null signals a model that
   // was opened but could not be read.
   return null;
  }

  // FileUtils returns null when the file cannot be read; fail here with
  // a clear message instead of passing null on to the tokenizer.
  String text = FileUtils.getFileDataAsString(inputFile);
  if (text == null) {
   throw new FileNotFoundException("Could not read input file: "
     + inputFile);
  }

  NameFinderME nameFinder = new NameFinderME(model);
  TokenizerUtil tokenizerUtil = new TokenizerUtil(tokenizerModel);
  String[] tokens = tokenizerUtil.tokenizeUsingLearnableTokenizer(text);

  Span[] nameSpans = nameFinder.find(tokens);

  String[] names = new String[nameSpans.length];
  for (int i = 0; i < nameSpans.length; i++) {
   names[i] = joinTokens(tokens, nameSpans[i].getStart(),
     nameSpans[i].getEnd());
  }
  return names;
 }

 /**
  * Reads the name finder model; returns null (after printing the stack
  * trace) when the file opens but its content cannot be read.
  */
 private static TokenNameFinderModel loadNameFinderModel(String nameModel)
   throws FileNotFoundException {
  // try-with-resources closes the stream on every path.
  try (InputStream modelIn = new FileInputStream(nameModel)) {
   return new TokenNameFinderModel(modelIn);
  } catch (FileNotFoundException e) {
   // Keep the declared exception visible to callers.
   throw e;
  } catch (IOException e) {
   e.printStackTrace();
   return null;
  }
 }

 /**
  * Joins tokens[start..end) with single spaces, so a two-token name reads
  * "Pierre Vinken" rather than "PierreVinken".
  */
 private static String joinTokens(String[] tokens, int start, int end) {
  StringBuilder joined = new StringBuilder();
  for (int i = start; i < end; i++) {
   if (i > start) {
    joined.append(' ');
   }
   joined.append(tokens[i]);
  }
  return joined.toString();
 }
}


import java.io.IOException;

/**
 * Command-line entry point: prints every person name found in a text file,
 * one per line.
 */
public class Main {
 /**
  * Runs the name finder.
  *
  * @param args
  *            optional overrides, in order: tokenizer model file, name
  *            finder model file, input text file. Any argument left out
  *            falls back to its built-in default path.
  * @throws IOException
  *             if a model or input file cannot be opened
  */
 public static void main(String args[]) throws IOException {
  String tokenModelFile = args.length > 0 ? args[0]
    : "/Users/harikrishna_gurram/study1/OpenNLP/apache-opennlp-1.6.0/bin/models/en-token.bin";
  String modelFile = args.length > 1 ? args[1]
    : "/Users/harikrishna_gurram/study1/OpenNLP/apache-opennlp-1.6.0/bin/models/en-ner-person.bin";
  String inputFile = args.length > 2 ? args[2]
    : "/Users/harikrishna_gurram/study1/OpenNLP/apache-opennlp-1.6.0/bin/models/input.txt";

  String[] result = NameFinderUtil.getNames(tokenModelFile, modelFile, inputFile);

  // getNames returns null when the name finder model cannot be read;
  // report that instead of failing on the loop below.
  if (result == null) {
   System.err.println("Could not load name finder model: " + modelFile);
   return;
  }

  for (String s : result) {
   System.out.println(s);
  }
 }
}


Output
Pierre
Vinken



Previous                                                 Next                                                 Home

No comments:

Post a Comment