

Bindings to the MorphoDiTa library


The ufal.morphodita package is a Python binding to the MorphoDiTa library.

The bindings are a straightforward conversion of the C++ bindings API. In Python 2, strings can be passed either as unicode or as UTF-8 encoded str, and the library always produces unicode. In Python 3, strings must be str.

Wrapped C++ API

The C++ API being wrapped follows; for a full reference of the original C++ API, see the MorphoDiTa documentation.

Helper Structures

  typedef vector<int> Indices;

  typedef vector<string> Forms;

  struct TaggedForm {
    string form;
    string tag;
  };
  typedef vector<TaggedForm> TaggedForms;

  struct TaggedLemma {
    string lemma;
    string tag;
  };
  typedef vector<TaggedLemma> TaggedLemmas;
  typedef vector<TaggedLemmas> Analyses;

  struct TaggedLemmaForms {
    string lemma;
    TaggedForms forms;
  };
  typedef vector<TaggedLemmaForms> TaggedLemmasForms;

  struct TokenRange {
    size_t start;
    size_t length;
  };
  typedef vector<TokenRange> TokenRanges;

  struct DerivatedLemma {
    std::string lemma;
  };
  typedef vector<DerivatedLemma> DerivatedLemmas;
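
In Python, these structures are exposed through SWIG-generated wrappers: struct fields become attributes, and the vector typedefs behave like mutable Python sequences (indexing, iteration, len). A minimal sketch with illustrative values:

from ufal.morphodita import TaggedLemma, TaggedLemmas

lemmas = TaggedLemmas()               # wraps std::vector<TaggedLemma>
lemma = TaggedLemma()
lemma.lemma, lemma.tag = 'dog', 'NN'  # struct fields are attributes
lemmas.push_back(lemma)               # the wrappers also support len, [] and iteration
print(len(lemmas), lemmas[0].lemma, lemmas[0].tag)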

Main Classes

  class Version {
    unsigned major;
    unsigned minor;
    unsigned patch;
    string prerelease;

    static Version current();
  };
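
The current library version can be queried directly from Python; a minimal sketch:

from ufal.morphodita import Version

version = Version.current()
print('%d.%d.%d' % (version.major, version.minor, version.patch))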

  class Tokenizer {
    virtual void setText(const char* text);
    virtual bool nextSentence(Forms* forms, TokenRanges* tokens);

    static Tokenizer* newVerticalTokenizer();
    static Tokenizer* newCzechTokenizer();
    static Tokenizer* newEnglishTokenizer();
    static Tokenizer* newGenericTokenizer();
  };
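
The static constructors return standalone tokenizers that need no model; model-specific tokenizers are obtained from Morpho.newTokenizer or Tagger.newTokenizer below. A minimal sketch using the generic tokenizer (in Python 3, both the input text and the produced forms are plain str):

from ufal.morphodita import Tokenizer, Forms, TokenRanges

tokenizer = Tokenizer.newGenericTokenizer()
tokenizer.setText('Hello world. How are you?')
forms, tokens = Forms(), TokenRanges()
while tokenizer.nextSentence(forms, tokens):
  print(list(forms))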

  class Derivator {
    virtual bool parent(const char* lemma, DerivatedLemma& parent) const;
    virtual bool children(const char* lemma, DerivatedLemmas& children) const;
  };

  class DerivationFormatter {
    virtual string formatDerivation(const char* lemma) const;

    static DerivationFormatter* newNoneDerivationFormatter();
    static DerivationFormatter* newRootDerivationFormatter(const Derivator* derivator);
    static DerivationFormatter* newPathDerivationFormatter(const Derivator* derivator);
    static DerivationFormatter* newTreeDerivationFormatter(const Derivator* derivator);
    static DerivationFormatter* newDerivationFormatter(const char* name, const Derivator* derivator);
  };
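
Morphological derivation is available only for models that contain derivation data (for example the Czech MorfFlex dictionaries). A minimal sketch, assuming such a model at a hypothetical path:

from ufal.morphodita import Morpho, DerivationFormatter

morpho = Morpho.load('czech-morfflex.dict')  # hypothetical model path
derivator = morpho.getDerivator()
if derivator is not None:
  formatter = DerivationFormatter.newRootDerivationFormatter(derivator)
  # Replace a lemma by the root of its derivation tree.
  print(formatter.formatDerivation('učitelka'))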

  class Morpho {
    static Morpho* load(const char* fname);

    enum { NO_GUESSER = 0, GUESSER = 1 };

    virtual int analyze(const char* form, int guesser, TaggedLemmas& lemmas) const;
    virtual int generate(const char* lemma, const char* tag_wildcard, int guesser, TaggedLemmasForms& forms) const;
    virtual string rawLemma(const char* lemma) const;
    virtual string lemmaId(const char* lemma) const;
    virtual string rawForm(const char* form) const;

    virtual Tokenizer* newTokenizer() const;

    virtual Derivator* getDerivator() const;
  };
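
Analysis and generation are shown in the full example below; in addition, rawLemma and lemmaId split a structured lemma into its parts. A minimal sketch, assuming a Czech model whose lemmas carry ids and comments (the path and lemma are illustrative):

from ufal.morphodita import Morpho

morpho = Morpho.load('czech-morfflex.dict')  # hypothetical model path
print(morpho.rawLemma('pes-1'))  # raw lemma without id and comments
print(morpho.lemmaId('pes-1'))   # lemma including its id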

  class Tagger {
    static Tagger* load(const char* fname);

    virtual const Morpho* getMorpho() const;

    virtual void tag(const Forms& forms, TaggedLemmas& tags, int guesser = -1) const;

    virtual void tagAnalyzed(const Forms& forms, const Analyses& analyses, Indices& tags) const;

    Tokenizer* newTokenizer() const;
  };
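
While tag runs the complete pipeline (see the tagging example below), tagAnalyzed only disambiguates among analyses supplied by the caller, returning for each form the index of the chosen analysis. A minimal sketch, assuming a loaded tagger (the model path and words are illustrative):

from ufal.morphodita import Tagger, Forms, TaggedLemmas, Analyses, Indices

tagger = Tagger.load('czech-morfflex-pdt.tagger')  # hypothetical model path
morpho = tagger.getMorpho()

forms = Forms()
analyses = Analyses()
for word in ['Vidím', 'psa']:
  forms.push_back(word)
  lemmas = TaggedLemmas()
  morpho.analyze(word, morpho.GUESSER, lemmas)
  analyses.push_back(lemmas)

indices = Indices()
tagger.tagAnalyzed(forms, analyses, indices)
for i, index in enumerate(indices):
  print(forms[i], analyses[i][index].lemma, analyses[i][index].tag)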

  class TagsetConverter {
    static TagsetConverter* newIdentityConverter();
    static TagsetConverter* newPdtToConll2009Converter();
    static TagsetConverter* newStripLemmaCommentConverter(const Morpho& morpho);
    static TagsetConverter* newStripLemmaIdConverter(const Morpho& morpho);

    virtual void convert(TaggedLemma& lemma) const;
    virtual void convertAnalyzed(TaggedLemmas& lemmas) const;
    virtual void convertGenerated(TaggedLemmasForms& forms) const;
  };
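
The converters modify the given structures in place. A minimal model-free sketch using the PDT-to-CoNLL 2009 converter (the lemma and tag are illustrative PDT values):

from ufal.morphodita import TagsetConverter, TaggedLemma

converter = TagsetConverter.newPdtToConll2009Converter()
lemma = TaggedLemma()
lemma.lemma, lemma.tag = 'pes-1', 'NNMS1-----A----'
converter.convert(lemma)  # rewrites lemma.lemma and lemma.tag in place
print(lemma.lemma, lemma.tag)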



Simple example performing morphological analysis and generation:

import sys

from ufal.morphodita import *

# In Python2, wrap sys.stdin and sys.stdout to work with unicode.
if sys.version_info[0] < 3:
  import codecs
  import locale
  encoding = locale.getpreferredencoding()
  sys.stdin = codecs.getreader(encoding)(sys.stdin)
  sys.stdout = codecs.getwriter(encoding)(sys.stdout)

if len(sys.argv) < 2:
  sys.stderr.write('Usage: %s dict_file\n' % sys.argv[0])
  sys.exit(1)

sys.stderr.write('Loading dictionary: ')
morpho = Morpho.load(sys.argv[1])
if not morpho:
  sys.stderr.write("Cannot load dictionary from file '%s'\n" % sys.argv[1])
  sys.exit(1)
sys.stderr.write('done\n')

lemmas = TaggedLemmas()
lemmas_forms = TaggedLemmasForms()
line = sys.stdin.readline()
while line:
  tokens = line.rstrip('\r\n').split('\t')
  if len(tokens) == 1: # analyze
    result = morpho.analyze(tokens[0], morpho.GUESSER, lemmas)

    guesser = "Guesser " if result == morpho.GUESSER else ""
    for lemma in lemmas:
      sys.stdout.write('%sLemma: %s %s\n' % (guesser, lemma.lemma, lemma.tag))
  elif len(tokens) == 2: # generate
    result = morpho.generate(tokens[0], tokens[1], morpho.GUESSER, lemmas_forms)

    guesser = "Guesser " if result == morpho.GUESSER else ""
    for lemma_forms in lemmas_forms:
      sys.stdout.write('%sLemma: %s\n' % (guesser, lemma_forms.lemma))
      for form in lemma_forms.forms:
        sys.stdout.write('  %s %s\n' % (form.form, form.tag))

  line = sys.stdin.readline()
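
The example reads standard input line by line: a line with a single token is analyzed, while a line with a lemma and a tag wildcard separated by a tab generates all matching forms. It can be run for instance as python morpho_example.py czech-morfflex.dict (the script and model names are illustrative).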


Simple example performing tokenization and PoS tagging:

import sys

from ufal.morphodita import *

def encode_entities(text):
  return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')

# In Python2, wrap sys.stdin and sys.stdout to work with unicode.
if sys.version_info[0] < 3:
  import codecs
  import locale
  encoding = locale.getpreferredencoding()
  sys.stdin = codecs.getreader(encoding)(sys.stdin)
  sys.stdout = codecs.getwriter(encoding)(sys.stdout)

if len(sys.argv) == 1:
  sys.stderr.write('Usage: %s tagger_file\n' % sys.argv[0])
  sys.exit(1)

sys.stderr.write('Loading tagger: ')
tagger = Tagger.load(sys.argv[1])
if not tagger:
  sys.stderr.write("Cannot load tagger from file '%s'\n" % sys.argv[1])
  sys.exit(1)
sys.stderr.write('done\n')

forms = Forms()
lemmas = TaggedLemmas()
tokens = TokenRanges()
tokenizer = tagger.newTokenizer()
if tokenizer is None:
  sys.stderr.write("No tokenizer is defined for the supplied model!\n")
  sys.exit(1)

not_eof = True
while not_eof:
  text = ''

  # Read block
  while True:
    line = sys.stdin.readline()
    not_eof = bool(line)
    if not not_eof: break
    line = line.rstrip('\r\n')
    text += line
    text += '\n'
    if not line: break

  # Tag
  tokenizer.setText(text)
  t = 0
  while tokenizer.nextSentence(forms, tokens):
    tagger.tag(forms, lemmas)

    for i in range(len(lemmas)):
      lemma = lemmas[i]
      token = tokens[i]
      sys.stdout.write('%s%s<token lemma="%s" tag="%s">%s</token>%s' % (
        encode_entities(text[t : token.start]),
        "<sentence>" if i == 0 else "",
        encode_entities(lemma.lemma),
        encode_entities(lemma.tag),
        encode_entities(text[token.start : token.start + token.length]),
        "</sentence>" if i + 1 == len(lemmas) else ""))
      t = token.start + token.length
  sys.stdout.write(encode_entities(text[t : ]))
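
The example reads standard input in blank-line-separated blocks, tags each block, and echoes it with XML-like <sentence> and <token> markup while preserving the original spacing. It can be run for instance as python tagger_example.py czech-morfflex-pdt.tagger (the script and model names are illustrative).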


Milan Straka

Jana Straková
