SphinxBase 0.6
src/libsphinxbase/lm/ngram_model.c
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * \file ngram_model.c N-Gram language models.
 *
 * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
 */

#include <config.h>

#include <string.h>
#include <assert.h>
#ifdef HAVE_ICONV
#include <iconv.h>
#include <errno.h>              /* for the E2BIG checks in ngram_model_recode() */
#endif

#include "sphinxbase/ngram_model.h"
#include "sphinxbase/ckd_alloc.h"
#include "sphinxbase/filename.h"
#include "sphinxbase/pio.h"
#include "sphinxbase/err.h"
#include "sphinxbase/logmath.h"
#include "sphinxbase/strfuncs.h"
#include "sphinxbase/case.h"

#include "ngram_model_internal.h"

ngram_file_type_t
ngram_file_name_to_type(const char *file_name)
{
    const char *ext;

    ext = strrchr(file_name, '.');
    if (ext == NULL) {
        return NGRAM_INVALID;
    }
    if (0 == strcmp_nocase(ext, ".gz")) {
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    else if (0 == strcmp_nocase(ext, ".bz2")) {
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    /* We use strncmp because there might be a .gz on the end. */
    if (0 == strncmp_nocase(ext, ".ARPA", 5))
        return NGRAM_ARPA;
    if (0 == strncmp_nocase(ext, ".DMP", 4))
        return NGRAM_DMP;
    return NGRAM_INVALID;
}

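/*
 * Usage sketch (illustrative, not part of the library): extension-based
 * type detection.  A compressed suffix is skipped first, so
 * "model.arpa.gz" is detected as ARPA, while "model.lm" matches neither
 * known extension and yields NGRAM_INVALID:
 *
 *     assert(ngram_file_name_to_type("model.arpa.gz") == NGRAM_ARPA);
 *     assert(ngram_file_name_to_type("model.DMP") == NGRAM_DMP);
 *     assert(ngram_file_name_to_type("model.lm") == NGRAM_INVALID);
 */
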
ngram_file_type_t
ngram_str_to_type(const char *str_name)
{
    if (0 == strcmp_nocase(str_name, "arpa"))
        return NGRAM_ARPA;
    if (0 == strcmp_nocase(str_name, "dmp"))
        return NGRAM_DMP;
    return NGRAM_INVALID;
}

char const *
ngram_type_to_str(int type)
{
    switch (type) {
    case NGRAM_ARPA:
        return "arpa";
    case NGRAM_DMP:
        return "dmp";
    default:
        return NULL;
    }
}

ngram_model_t *
ngram_model_read(cmd_ln_t *config,
                 const char *file_name,
                 ngram_file_type_t file_type,
                 logmath_t *lmath)
{
    ngram_model_t *model = NULL;

    switch (file_type) {
    case NGRAM_AUTO: {
        if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
            break;
        if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
            break;
        return NULL;
    }
    case NGRAM_ARPA:
        model = ngram_model_arpa_read(config, file_name, lmath);
        break;
    case NGRAM_DMP:
        model = ngram_model_dmp_read(config, file_name, lmath);
        break;
    default:
        E_ERROR("language model file type not supported\n");
        return NULL;
    }
    /* The ARPA and DMP readers can still fail; don't try to apply
     * weights to a NULL model below. */
    if (model == NULL)
        return NULL;

    /* Now set weights based on config if present. */
    if (config) {
        float32 lw = 1.0;
        float32 wip = 1.0;
        float32 uw = 1.0;

        if (cmd_ln_exists_r(config, "-lw"))
            lw = cmd_ln_float32_r(config, "-lw");
        if (cmd_ln_exists_r(config, "-wip"))
            wip = cmd_ln_float32_r(config, "-wip");
        if (cmd_ln_exists_r(config, "-uw"))
            uw = cmd_ln_float32_r(config, "-uw");

        ngram_model_apply_weights(model, lw, wip, uw);
    }

    return model;
}

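/*
 * Usage sketch (illustrative, not part of the library): reading a model
 * with automatic type detection.  The file name "example.arpa" and the
 * logmath parameters are assumptions for illustration.
 *
 *     logmath_t *lmath = logmath_init(1.0001, 0, 0);
 *     ngram_model_t *lm =
 *         ngram_model_read(NULL, "example.arpa", NGRAM_AUTO, lmath);
 *     if (lm == NULL)
 *         E_FATAL("Failed to read example.arpa\n");
 *     ...
 *     ngram_model_free(lm);
 *     logmath_free(lmath);
 */
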
int
ngram_model_write(ngram_model_t *model, const char *file_name,
                  ngram_file_type_t file_type)
{
    switch (file_type) {
    case NGRAM_AUTO: {
        file_type = ngram_file_name_to_type(file_name);
        /* Default to ARPA (catches .lm and other things) */
        if (file_type == NGRAM_INVALID)
            file_type = NGRAM_ARPA;
        return ngram_model_write(model, file_name, file_type);
    }
    case NGRAM_ARPA:
        return ngram_model_arpa_write(model, file_name);
    case NGRAM_DMP:
        return ngram_model_dmp_write(model, file_name);
    default:
        E_ERROR("language model file type not supported\n");
        return -1;
    }
}

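/*
 * Usage sketch (illustrative): converting an ARPA model to the binary
 * DMP format.  "in.arpa" and "out.DMP" are placeholder names.
 *
 *     ngram_model_t *lm =
 *         ngram_model_read(NULL, "in.arpa", NGRAM_ARPA, lmath);
 *     if (lm && ngram_model_write(lm, "out.DMP", NGRAM_DMP) < 0)
 *         E_ERROR("Failed to write out.DMP\n");
 */
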
int32
ngram_model_init(ngram_model_t *base,
                 ngram_funcs_t *funcs,
                 logmath_t *lmath,
                 int32 n, int32 n_unigram)
{
    base->refcount = 1;
    base->funcs = funcs;
    base->n = n;
    /* Allocate the N-gram count array unless this model was
     * previously initialized. */
    if (base->n_counts == NULL)
        base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
    /* Don't reset weights if logmath object hasn't changed. */
    if (base->lmath != lmath) {
        /* Set default values for weights. */
        base->lw = 1.0;
        base->log_wip = 0; /* i.e. 1.0 */
        base->log_uw = 0;  /* i.e. 1.0 */
        base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
        base->log_uniform_weight = logmath_get_zero(lmath);
        base->log_zero = logmath_get_zero(lmath);
        base->lmath = lmath;
    }
    /* Allocate or reallocate space for word strings. */
    if (base->word_str) {
        /* Free all previous word strings if they were allocated. */
        if (base->writable) {
            int32 i;
            for (i = 0; i < base->n_words; ++i) {
                ckd_free(base->word_str[i]);
                base->word_str[i] = NULL;
            }
        }
        base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
    }
    else
        base->word_str = ckd_calloc(n_unigram, sizeof(char *));
    /* NOTE: Word lookups are no longer case-insensitive since we allow
     * other encodings for word strings.  Beware. */
    if (base->wid)
        hash_table_empty(base->wid);
    else
        base->wid = hash_table_new(n_unigram, FALSE);
    base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;

    return 0;
}

ngram_model_t *
ngram_model_retain(ngram_model_t *model)
{
    ++model->refcount;
    return model;
}

void
ngram_model_flush(ngram_model_t *model)
{
    if (model->funcs && model->funcs->flush)
        (*model->funcs->flush)(model);
}

int
ngram_model_free(ngram_model_t *model)
{
    int i;

    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    if (model->funcs && model->funcs->free)
        (*model->funcs->free)(model);
    if (model->writable) {
        /* Free all words. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Free all class words. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}

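/*
 * Usage sketch (illustrative): reference counting.  A consumer that
 * keeps a model alive retains it and later releases it; the storage is
 * only freed when the last reference goes away.
 *
 *     ngram_model_t *mine = ngram_model_retain(lm); // refcount now 2
 *     ...
 *     ngram_model_free(mine);                       // refcount back to 1
 *     ngram_model_free(lm);                         // actually freed
 */
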
int
ngram_model_casefold(ngram_model_t *model, int kase)
{
    int writable, i;
    hash_table_t *new_wid;

    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;

    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        char *outstr;
        if (writable) {
            outstr = model->word_str[i];
        }
        else {
            outstr = ckd_salloc(model->word_str[i]);
        }
        /* Don't case-fold <tags> or [classes] */
        if (outstr[0] != '<' && outstr[0] != '[') {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                break;
            }
        }
        model->word_str[i] = outstr;

        /* Now update the hash table.  We might have terrible
         * collisions here, so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}

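/*
 * Usage sketch (illustrative): folding every word string to upper case,
 * e.g. to match a dictionary that uses upper-case entries.  Markers
 * such as <s> and class tags like [city] are left untouched.
 *
 *     if (ngram_model_casefold(lm, NGRAM_UPPER) < 0)
 *         E_ERROR("Case folding failed\n");
 */
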
#ifdef HAVE_ICONV
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    iconv_t ic;
    char *outbuf;
    size_t maxlen;
    int i, writable;
    hash_table_t *new_wid;

    /* FIXME: Need to do a special case thing for the GB-HEX encoding
     * used in Sphinx3 Mandarin models. */
    if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
        E_ERROR_SYSTEM("iconv_open() failed");
        return -1;
    }
    /* iconv(3) won't accept a NULL output buffer, unlike wcstombs(3).
     * So we have to either call it over and over again until our
     * buffer is big enough, or call it with a huge buffer and then
     * copy things back to the output.  We will use a mix of these two
     * approaches here.  We'll keep a single big buffer around, and
     * expand it as necessary.
     */
    maxlen = 0;
    for (i = 0; i < model->n_words; ++i) {
        if (strlen(model->word_str[i]) > maxlen)
            maxlen = strlen(model->word_str[i]);
    }
    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;
    /* Really should be big enough except for pathological cases. */
    maxlen = maxlen * sizeof(int) + 15;
    outbuf = ckd_calloc(maxlen, 1);
    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        ICONV_CONST char *in;
        char *out;
        size_t inleft, outleft, result;

    start_conversion:
        in = (ICONV_CONST char *)model->word_str[i];
        /* Yes, this assumes that we don't have any NUL bytes. */
        inleft = strlen(in);
        out = outbuf;
        outleft = maxlen;

        while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            out = outbuf = ckd_realloc(outbuf, maxlen);
            /* Reset the input pointers. */
            in = (ICONV_CONST char *)model->word_str[i];
            inleft = strlen(in);
        }

        /* Now flush a shift-out sequence, if any. */
        if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            outbuf = ckd_realloc(outbuf, maxlen);
            /* Restart conversion of this word with the larger buffer. */
            goto start_conversion;
        }

        /* Number of bytes actually converted. */
        result = maxlen - outleft;
        if (writable) {
            /* Grow or shrink the output string as necessary. */
            model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
            model->word_str[i][result] = '\0';
        }
        else {
            /* It actually was not allocated previously, so do that now. */
            model->word_str[i] = ckd_calloc(result + 1, 1);
        }
        /* Copy the new string in. */
        memcpy(model->word_str[i], outbuf, result);

        /* Now update the hash table.  We might have terrible collisions
         * if a non-reversible conversion was requested, so warn about
         * them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    ckd_free(outbuf);
    iconv_close(ic);
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;

    return 0;
}
#else /* !HAVE_ICONV */
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    /* Recoding is not supported without iconv. */
    return -1;
}
#endif /* !HAVE_ICONV */

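/*
 * Usage sketch (illustrative): recoding word strings, e.g. from GB2312
 * to UTF-8 for a Mandarin model.  The encoding names are passed
 * straight to iconv_open(3), so their spelling is system-dependent.
 *
 *     if (ngram_model_recode(lm, "GB2312", "UTF-8") < 0)
 *         E_ERROR("Recoding failed (or no iconv support)\n");
 */
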
int
ngram_model_apply_weights(ngram_model_t *model,
                          float32 lw, float32 wip, float32 uw)
{
    return (*model->funcs->apply_weights)(model, lw, wip, uw);
}

float32
ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
                        int32 *out_log_uw)
{
    if (out_log_wip) *out_log_wip = model->log_wip;
    if (out_log_uw) *out_log_uw = model->log_uw;
    return model->lw;
}

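/*
 * Usage sketch (illustrative): applying a language weight and word
 * insertion penalty, then reading them back.  The values 9.5 and 0.7
 * are arbitrary examples.
 *
 *     int32 log_wip, log_uw;
 *     float32 lw;
 *
 *     ngram_model_apply_weights(lm, 9.5, 0.7, 1.0);
 *     lw = ngram_model_get_weights(lm, &log_wip, &log_uw);
 *     E_INFO("lw %f log_wip %d log_uw %d\n", lw, log_wip, log_uw);
 */
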
int32
ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
               int32 n_hist, int32 *n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score)(model, wid, history, n_hist, n_used);

    /* Multiply by unigram in-class weight (an addition in log domain). */
    return score + class_weight;
}

int32
ngram_score(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_score(model, ngram_wid(model, word),
                          histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

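/*
 * Usage sketch (illustrative): the NULL-terminated history arguments
 * are given in reverse order, most recent first.  Both calls below
 * compute the same trigram score for "three" following "one two":
 *
 *     int32 n_used;
 *     int32 s1 = ngram_score(lm, "three", "two", "one", NULL);
 *     int32 s2 = ngram_tg_score(lm, ngram_wid(lm, "three"),
 *                               ngram_wid(lm, "two"),
 *                               ngram_wid(lm, "one"), &n_used);
 */
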
int32
ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
{
    int32 hist[2];
    hist[0] = w2;
    hist[1] = w1;
    return ngram_ng_score(model, w3, hist, 2, n_used);
}

int32
ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
{
    return ngram_ng_score(model, w2, &w1, 1, n_used);
}

int32
ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
              int32 n_hist, int32 *n_used)
{
    int32 prob, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    prob = (*model->funcs->raw_score)(model, wid, history,
                                      n_hist, n_used);
    /* Multiply by unigram in-class weight (an addition in log domain). */
    return prob + class_weight;
}

int32
ngram_prob(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_prob(model, ngram_wid(model, word),
                         histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_score_to_prob(ngram_model_t *base, int32 score)
{
    int32 prob;

    /* Undo insertion penalty. */
    prob = score - base->log_wip;
    /* Undo language weight. */
    prob = (int32)(prob / base->lw);

    return prob;
}

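/*
 * Usage sketch (illustrative): ngram_ng_score() returns the weighted
 * value, roughly raw_logprob * lw + log_wip in integer log space
 * (ignoring the unigram weight), while ngram_ng_prob() returns the raw
 * log probability.  ngram_score_to_prob() approximately inverts the
 * weighting, up to integer rounding:
 *
 *     int32 n_used;
 *     int32 wid = ngram_wid(lm, "three");
 *     int32 hist[2] = { ngram_wid(lm, "two"), ngram_wid(lm, "one") };
 *     int32 score = ngram_ng_score(lm, wid, hist, 2, &n_used);
 *     int32 raw = ngram_ng_prob(lm, wid, hist, 2, &n_used);
 *     ... ngram_score_to_prob(lm, score) is approximately raw ...
 */
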
int32
ngram_unknown_wid(ngram_model_t *model)
{
    int32 val;

    /* FIXME: This could be memoized for speed if necessary. */
    /* Look up <UNK>, if not found return NGRAM_INVALID_WID. */
    if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
        return NGRAM_INVALID_WID;
    else
        return val;
}

int32
ngram_zero(ngram_model_t *model)
{
    return model->log_zero;
}

int32
ngram_model_get_size(ngram_model_t *model)
{
    if (model != NULL)
        return model->n;
    return 0;
}

int32 const *
ngram_model_get_counts(ngram_model_t *model)
{
    if (model != NULL)
        return model->n_counts;
    return NULL;
}

void
ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
                int m, int successor)
{
    itor->model = model;
    itor->wids = ckd_calloc(model->n, sizeof(*itor->wids));
    itor->m = m;
    itor->successor = successor;
}

ngram_iter_t *
ngram_model_mgrams(ngram_model_t *model, int m)
{
    ngram_iter_t *itor;
    /* The fact that m=n-1 is not exactly obvious.  Prevent accidents. */
    if (m >= model->n)
        return NULL;
    if (model->funcs->mgrams == NULL)
        return NULL;
    itor = (*model->funcs->mgrams)(model, m);
    return itor;
}

ngram_iter_t *
ngram_iter(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    ngram_iter_t *itor;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
    ckd_free(histid);
    return itor;
}

ngram_iter_t *
ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
{
    if (n_hist >= model->n)
        return NULL;
    if (model->funcs->iter == NULL)
        return NULL;
    return (*model->funcs->iter)(model, wid, history, n_hist);
}

ngram_iter_t *
ngram_iter_successors(ngram_iter_t *itor)
{
    /* Stop when we are at the highest order N-Gram. */
    if (itor->m == itor->model->n - 1)
        return NULL;
    return (*itor->model->funcs->successors)(itor);
}

int32 const *
ngram_iter_get(ngram_iter_t *itor,
               int32 *out_score,
               int32 *out_bowt)
{
    return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
}

ngram_iter_t *
ngram_iter_next(ngram_iter_t *itor)
{
    return (*itor->model->funcs->iter_next)(itor);
}

void
ngram_iter_free(ngram_iter_t *itor)
{
    ckd_free(itor->wids);
    (*itor->model->funcs->iter_free)(itor);
}

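/*
 * Usage sketch (illustrative): enumerating all unigrams (m = 0) and
 * printing their weighted scores.  ngram_iter_next() is assumed here to
 * dispose of the iterator when it returns NULL at the end of the run.
 *
 *     ngram_iter_t *itor;
 *     for (itor = ngram_model_mgrams(lm, 0); itor;
 *          itor = ngram_iter_next(itor)) {
 *         int32 score, bowt;
 *         int32 const *wids = ngram_iter_get(itor, &score, &bowt);
 *         printf("%s %d %d\n", ngram_word(lm, wids[0]), score, bowt);
 *     }
 */
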
int32
ngram_wid(ngram_model_t *model, const char *word)
{
    int32 val;

    if (hash_table_lookup_int32(model->wid, word, &val) == -1)
        return ngram_unknown_wid(model);
    else
        return val;
}

const char *
ngram_word(ngram_model_t *model, int32 wid)
{
    /* Remove any class tag */
    wid = NGRAM_BASEWID(wid);
    if (wid >= model->n_words)
        return NULL;
    return model->word_str[wid];
}

int32
ngram_add_word_internal(ngram_model_t *model,
                        const char *word,
                        int32 classid)
{
    void *dummy;
    int32 wid;

    /* Take the next available word ID */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }
    /* Check for duplicate definitions. */
    if (hash_table_lookup(model->wid, word, &dummy) == 0) {
        E_ERROR("Duplicate definition of word %s\n", word);
        return NGRAM_INVALID_WID;
    }
    /* Reallocate word_str if necessary. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) * model->n_1g_alloc);
    }
    /* Add the word string in the appropriate manner. */
    /* Class words are always dynamically allocated. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* Now enter it into the hash table. */
    if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
                model->word_str[model->n_words], (void *)(long)(wid));
    }
    /* Increment number of words. */
    ++model->n_words;
    return wid;
}

int32
ngram_model_add_word(ngram_model_t *model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Do what needs to be done to add the word to the unigram. */
    if (model->funcs && model->funcs->add_ug)
        prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
    if (prob == 0) {
        if (model->writable)
            ckd_free(model->word_str[wid]);
        return -1;
    }
    return wid;
}

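/*
 * Usage sketch (illustrative): adding an out-of-vocabulary word with a
 * small unigram weight.  The word and weight are arbitrary examples.
 *
 *     int32 wid = ngram_model_add_word(lm, "xylophone", 0.0001f);
 *     if (wid == NGRAM_INVALID_WID)
 *         E_ERROR("Failed to add word\n");
 */
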
ngram_class_t *
ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    /* wid_base is the wid (minus class tag) of the first word in the list. */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    if (tprob > 1.1 || tprob < 0.9) {
        E_WARN("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}

int32
ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Initialize everything in it to -1 */
        lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }
    /* Simplest possible hash function.  This will work pretty well
     * when this function is called repeatedly with contiguous word
     * IDs, though... */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Good, no collision. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;
        /* Collision... Find the end of the hash chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);
        /* Is there any space left in the hash table? */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* No, so double its size. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash,
                   0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* Just use the next allocated one (easy) */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Look for any available bucket.  We hope this doesn't happen. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* This should absolutely not happen. */
            assert(next != lmclass->n_hash);
        }
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}

void
ngram_class_free(ngram_class_t *lmclass)
{
    ckd_free(lmclass->nword_hash);
    ckd_free(lmclass->prob1);
    ckd_free(lmclass);
}

int32
ngram_model_add_class_word(ngram_model_t *model,
                           const char *classname,
                           const char *word,
                           float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Find the class corresponding to classname.  Linear search
     * probably okay here since there won't be very many classes, and
     * this doesn't have to be fast. */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* Hmm, no such class.  It's probably not a good idea to create one. */
    if (classid == model->n_classes) {
        E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Add this word to the model's set of words. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* This is the fixed probability of the new word. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
    /* Now normalize everything else to fit it in.  This is
     * accomplished by simply scaling all the other probabilities
     * by (1-fprob). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Now add it to the class hash table. */
    return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
}

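/*
 * Usage sketch (illustrative): adding a word to an existing class tag
 * [city] with weight 1.0, i.e. an even share of the class probability
 * after renormalization.  The tag and word are arbitrary examples.
 *
 *     if (ngram_model_add_class_word(lm, "[city]", "pittsburgh", 1.0) < 0)
 *         E_ERROR("Failed to add class word\n");
 */
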
int32
ngram_model_add_class(ngram_model_t *model,
                      const char *classname,
                      float32 classweight,
                      char **words,
                      const float32 *weights,
                      int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in model.  If not, add it. */
    if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes * sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}

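/*
 * Usage sketch (illustrative): creating a small class at runtime.  The
 * class tag [city] and its member words are arbitrary examples; the
 * weights should roughly sum to one or they will be renormalized.
 *
 *     const char *words[] = { "pittsburgh", "boston" };
 *     float32 weights[] = { 0.5, 0.5 };
 *     int32 classid = ngram_model_add_class(lm, "[city]", 1.0,
 *                                           (char **)words, weights, 2);
 *     if (classid < 0)
 *         E_ERROR("Failed to add class\n");
 */
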
int32
ngram_class_prob(ngram_class_t *lmclass, int32 wid)
{
    int32 base_wid = NGRAM_BASEWID(wid);

    /* Words outside the contiguous initial block live in the hash
     * table (note that start_wid + n_words is already out of range). */
    if (base_wid < lmclass->start_wid
        || base_wid >= lmclass->start_wid + lmclass->n_words) {
        int32 hash;

        /* Look it up in the hash table. */
        hash = wid & (lmclass->n_hash - 1);
        while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
            hash = lmclass->nword_hash[hash].next;
        if (hash == -1)
            return 1;
        return lmclass->nword_hash[hash].prob1;
    }
    else {
        return lmclass->prob1[base_wid - lmclass->start_wid];
    }
}

int32
read_classdef_file(hash_table_t *classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* Look for an end of class marker. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Construct a class from the list of words collected. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }
                /* The word strings now belong to the classdef; reset
                 * the lists so they are not freed a second time in
                 * error_out below. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;

                /* Add this class to the hash table. */
                if (hash_table_enter(classes, classname, classdef) != classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }
                /* The hash table owns classname now. */
                classname = NULL;
            }
            else {
                float32 fprob;

                if (n_words == 2)
                    fprob = (float32)atof_c(wptr[1]);
                else
                    fprob = 1.0f;
                /* Add it to the list of words for this class. */
                classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* Start a new LM class if the LMCLASS marker is seen */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                if (inclass)
                    goto error_out;
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            /* Otherwise, just ignore whatever junk we got */
        }
    }
    rv = 0; /* Success. */

error_out:
    /* Free all the stuff we might have allocated. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}

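/*
 * Classdef file format accepted above (reconstructed from the parser;
 * the class name and words are illustrative).  Each class is bracketed
 * by LMCLASS/END markers, and each member line is a word with an
 * optional probability, defaulting to 1.0:
 *
 *     LMCLASS [city]
 *     pittsburgh 0.5
 *     boston 0.5
 *     END [city]
 */
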
void
classdef_free(classdef_t *classdef)
{
    int32 i;
    for (i = 0; i < classdef->n_words; ++i)
        ckd_free(classdef->words[i]);
    ckd_free(classdef->words);
    ckd_free(classdef->weights);
    ckd_free(classdef);
}

int32
ngram_model_read_classdef(ngram_model_t *model,
                          const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Create a new class in the language model for each classdef. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

error_out:
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *)he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}