lm.h
Go to the documentation of this file.
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2004 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * lm.h - Disk/memory based word-trigram backoff LM
39  *
40  * **********************************************
41  * CMU ARPA Speech Project
42  *
43  * Copyright (c) 1997 Carnegie Mellon University.
44  * ALL RIGHTS RESERVED.
45  * **********************************************
46  *
47  * HISTORY
48  * $Log: lm.h,v $
49  * Revision 1.16 2006/03/02 22:10:36 arthchan2003
50  * Add *g_write into the code.
51  *
52  * Revision 1.15 2006/02/28 22:26:51 egouvea
53  * Moved definition of lm_wid() outside of the #if 0/#endif block, so
54  * it's declared.
55  *
56  * Revision 1.14 2006/02/24 13:38:08 arthchan2003
57  * Added lm_read, it is a simple version of lm_read_advance.
58  *
59  * Revision 1.13 2006/02/23 04:16:29 arthchan2003
60  * Merged from SPHINX3_5_2_RCI_IRII_BRANCH:
61  * Splited the original lm.c into five parts,
62  * a, lm.c - a controller of other subroutines.
63  * b, lm_3g.c - implement TXT-based lm operations
64  * c, lm_3g_dmp.c - implement DMP-based lm operations
65  * d, lm_attfsm.c - implement FSM-based lm operations
66  * e, lmset.c - implement sets of lm.
67  *
68  * Revision 1.12.4.3 2006/01/16 19:56:37 arthchan2003
69  * 1, lm_rawscore doesn't need a language weight, 2, Support dumping the LM in FST format. This code used Yannick Esteve's and LIUM code.
70  *
71  * Revision 1.12.4.2 2005/11/17 06:15:22 arthchan2003
72  * Added input-encoding and output-encoding into the lm structure.
73  *
74  * Revision 1.12.4.1 2005/07/13 01:46:22 arthchan2003
75  * 1, Fixed dox-doc, 2, Added more documentation for major functions such as lm_read and lm_write.
76  *
77  * Revision 1.12 2005/06/21 22:24:02 arthchan2003
78  * Log. In this change, I introduced a new interface for lm ,which is
79  * call lmset_t. lmset_t wraps up multiple lm, n_lm, n_alloclm into the
80  * same structure and handle LM initialization (lm_init) switching,
81  * (lmset_curlm_widx), delete LM (lmset_delete_lm). The internal
82  * structure is called lmarray and is an array of pointers of lm. The
83  * current lm is always maintained and pointed by a pointer called cur_lm
84  * . This substantially clarify the structure of the code. At this
85  * check-in, not every core function of lmset is completed.
86  * e.g. lmset_add_lm because that required testing of several LM reading
87  * routines and could be quite time-consuming.
88  *
89  * Log. Another notable change is the fact dict2lmwid map is started to
90  * be part of the LM. The reason of this is clearly described inside the
91  * code. Don't want to repeat here.
92  *
93  * Log. The new interface has been already used broadly in both Sphinx
94  * 3.0 and sphinx 3.x family of tools.
95  *
96  * Revision 1.5 2005/06/18 03:22:28 archan
97  * Add lmset_init. A wrapper function of various LM initialization and initialize an lmset It is now used in decode, livepretend, dag and astar.
98  *
99  * Revision 1.4 2005/06/17 23:44:40 archan
100  * Sphinx3 to s3.generic, 1, Support -lmname in decode and livepretend. 2, Wrap up the initialization of dict2lmwid to lm initialization. 3, add Dave's trick in LM switching in mode 4 of the search.
101  *
102  * Revision 1.3 2005/06/13 04:02:59 archan
103  * Fixed most doxygen-style documentation under libs3decoder.
104  *
105  * Revision 1.2 2005/05/10 21:21:54 archan
106  * Three functionalities added but not tested. Code on 1) addition/deletion of LM in mode 4. 2) reading text-based LM 3) Converting txt-based LM to dmp-based LM.
107  *
108  * Revision 1.1 2005/05/04 06:08:07 archan
109  * Refactor all lm routines except fillpen.c into ./libs3decoder/liblm/ . This will be equivalent to ./lib/liblm in future.
110  *
111  * Revision 1.6 2005/05/04 04:02:24 archan
112  * Implementation of lm addition, deletion in (mode 4) time-switching tree implementation of search. Not yet tested. Just want to keep up my own momentum.
113  *
114  * Revision 1.5 2005/04/21 23:50:26 archan
115  * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in. At this moment, everything in search mode 5 is already done. It is time to test the idea whether the search can really be used.
116  *
117  * Revision 1.4 2005/04/20 03:37:59 archan
118  * LM code changes: functions are added to set, add and delete LM from the lmset, change the legacy lmset data structure to contain n_lm and n_alloc_lm.
119  *
120  * Revision 1.3 2005/03/30 01:22:47 archan
121  * Fixed mistakes in last updates. Add
122  *
123  *
124  * 20.Apr.2001 RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu)
125  * Adding lm_free() to free allocated memory
126  *
127  * 24-Jun-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
128  * Added lm_t.access_type; made lm_wid externally visible.
129  *
130  * 24-Jun-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
131  * Added lm_t.log_bg_seg_sz and lm_t.bg_seg_sz.
132  *
133  * 13-Feb-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
134  * Created from original S3 version.
135  */
136 
137 
138 #ifndef _S3_LM_H_
139 #define _S3_LM_H_
140 
141 #include <stdio.h>
142 
143 #include <logmath.h>
144 #include <hash_table.h>
145 #include <cmd_ln.h>
146 
147 #ifdef __cplusplus
148 extern "C" {
149 #endif
150 #if 0
151 } /* Fool Emacs into not indenting things. */
152 #endif
153 
/* Sentinel for a dictionary word with no LM mapping.
 * NOTE(review): presumably stored in lm_t.dict2lmwid -- confirm against lm.c. */
154 #define LM_DICTWID_BADMAP -16000
/* Class IDs start at this base; see LM_CLASSID_TO_CLASS below. */
155 #define LM_CLASSID_BASE 0x01000000
/* Aliases for the 16-bit and 32-bit "bad LM word ID" values. BAD_S3LMWID and
 * BAD_S3LMWID32 are not defined in this header (presumably in s3types.h). */
158 #define LM_LEGACY_CONSTANT BAD_S3LMWID
163 #define LM_SPHINX_CONSTANT BAD_S3LMWID32
/* Map class ID i to its entry in m->lmclass by subtracting LM_CLASSID_BASE. */
172 #define LM_CLASSID_TO_CLASS(m,i) ((m)->lmclass[(i)-LM_CLASSID_BASE])
173
/* NOTE(review): presumably a floor for log probabilities -- confirm. */
174 #define MIN_PROB_F -99.0
/* NOTE(review): presumably an allocation granularity -- confirm. */
182 #define LM_ALLOC_BLOCK 16
/*
 * Status codes: LM_SUCCESS is 1, LM_FAIL is 0, and every more specific
 * failure is a distinct negative value.
 */
188 #define LM_SUCCESS 1
190 #define LM_FAIL 0
191 #define LM_NOT_FOUND -1
193 #define LM_OFFSET_TOO_LARGE -2
199 #define LM_NO_DATA_MARK -3
202 #define LM_UNKNOWN_NG -4
204 #define LM_BAD_LM_COUNT -5
206 #define LM_UNKNOWN_WORDS -6
209 #define LM_BAD_BIGRAM -7
215 #define LM_BAD_TRIGRAM -8
221 #define LM_BAD_QUADGRAM -9
228 #define LM_BAD_QUINGRAM -10
239 #define LM_BAD_NGRAM -11
245 #define LM_TOO_MANY_NGRAM -12
249 #define LM_NO_MINUS_1GRAM -13
252 #define LM_FILE_NOT_FOUND -14
254 #define LM_CANNOT_ALLOCATE -15
/*
 * Format/version identifiers: the DMP variants are <= 0, the text/FST
 * variants are >= 1000.
 * NOTE(review): presumably the values stored in lm_t.version -- confirm.
 */
258 #define LMDMP_VERSIONNULL 0
264 #define LMDMP_VERSION_TG_16BIT -1
268 #define LMDMP_VERSION_TG_16BIT_V2 -2
271 #define LMDMP_VERSION_TG_32BIT -3
277 #define LMTXT_VERSION 1000
278 #define LMFST_VERSION 1001
279 #define LMFORCED_TXT32VERSION 1002
/* NOTE(review): presumably a "no word" marker -- confirm where it is used. */
287 #define NO_WORD -1
288 
289 #include "s3types.h"
290 #include "lmclass.h"
291 #include "dict.h"
292 
293 /*
294  * ARCHAN 20050503: comment copied from Sphinx 2
295  * Bigram probs and bo-wts, and trigram probs are kept in separate tables
296  * rather than within the bigram_t and trigram_t structures. These tables
297  * hold unique prob and bo-wt values, and can be < 64K long (see lm_3g.h).
298  * The following tree structure is used to construct these tables of unique
299  * values. Whenever a new value is read from the LM file, the sorted tree
300  * structure is searched to see if the value already exists, and inserted
301  * if not found.
302  */
303 
/**
 * Log quantities represented in either floating or integer format.
 * The access macros in this header (LM_UGPROB, LM_BGPROB, LM_TGPROB)
 * read the integer member `l`.
 */
330 typedef union {
331  float32 f; /* floating-point view of the value */
332  int32 l; /* integer view of the value */
333 } lmlog_t;
335 
336 
341 typedef struct sorted_entry_s {
342  lmlog_t val;
343  uint32 lower;
346  uint32 higher;
350 
355 typedef struct {
357  int32 free;
358 } sorted_list_t;
359 
/**
 * A unigram structure.
 */
364 typedef struct {
365  s3wid_t dictwid; /* dictionary word ID of this LM word; read by LM_DICTWID and lm_lmwid2dictwid */
369  lmlog_t prob; /* unigram probability; LM_UGPROB reads prob.l */
370  lmlog_t bowt; /* presumably the back-off weight ("bo-wt") -- TODO confirm */
371  int32 firstbg; /* presumably the index of this word's first bigram -- TODO confirm */
372 } ug_t;
373 
/**
 * A bigram structure (16-bit fields).
 *
 * Probabilities and back-off weights are not stored inline: the id fields
 * index tables of unique values, which is why 16 bits are enough (see the
 * "Comments by RKM" note in this file).
 */
378 typedef struct {
379  s3lmwid_t wid; /* LM word ID (16-bit type) */
380  uint16 probid; /* index into lm_t.bgprob (see LM_BGPROB) */
381  uint16 bowtid; /* index of the back-off weight; presumably into lm_t.tgbowt -- TODO confirm */
382  uint16 firsttg; /* 16-bit index of the first trigram, relative to the trigram segment (see lm_t.tg_segbase) */
383 } bg_t;
384 
385 
/**
 * A bigram structure which has 32 bits: same members as bg_t, each
 * widened to a 32-bit type.
 */
389 typedef struct {
390  s3lmwid32_t wid; /* LM word ID (32-bit type) */
391  uint32 probid; /* index of the bigram probability (cf. bg_t.probid) */
392  uint32 bowtid; /* index of the back-off weight (cf. bg_t.bowtid) */
393  uint32 firsttg; /* index of the first trigram (cf. bg_t.firsttg) */
394 } bg32_t;
395 
396 
/**
 * A trigram structure (16-bit fields).
 */
401 typedef struct {
402  s3lmwid_t wid; /* LM word ID (16-bit type) */
403  uint16 probid; /* index into lm_t.tgprob (see LM_TGPROB) */
404 } tg_t;
405 
406 
411 typedef struct {
413  uint32 probid;
415 
416 
420 typedef struct {
421  bg_t *bg;
422  int32 used;
/**
 * A 32 bits version of membg_t (management of in-memory bigrams; not used
 * if all bigrams are in memory).
 */
430 typedef struct {
431  bg32_t *bg32; /* pointer to 32-bit bigram entries */
432  int32 used; /* NOTE(review): presumably a "recently used" flag -- confirm in lm.c */
434 } membg32_t;
435 
436 
/**
 * Trigram cache that speeds up locating the trigrams for a given bigram
 * (w_1,w_2). Entries are chained through `next`.
 */
448 typedef struct tginfo_s {
449  s3lmwid_t w1; /* an LM word ID; presumably the w_1 of the bigram -- TODO confirm */
451  int32 n_tg; /* presumably the number of entries in tg -- TODO confirm */
452  tg_t *tg; /* pointer to trigram entries */
453  int32 bowt; /* presumably the back-off weight for the bigram -- TODO confirm */
454  int32 used; /* NOTE(review): presumably a "recently used" flag -- confirm in lm.c */
455  struct tginfo_s *next; /* next entry in the linked list */
456 } tginfo_t;
457 
463 typedef struct tginfo32_s {
466  int32 n_tg;
467  tg32_t *tg32;
468  int32 bowt;
469  int32 used;
470  struct tginfo32_s *next;
474 /*
475  * \struct lm_tgcache_entry_t
476  * Entries in a fast and dirty cache for trigram lookups. See lm_t.tgcache.
477  */
478 typedef struct {
479  s3lmwid_t lwid[3];
480  int32 lscr;
482 
484 /*
485  * \struct lm_tgcache_entry32_t
486  * \brief 32 bit version of lm_tg_cache_entry
487  */
488 typedef struct {
489  s3lmwid32_t lwid[3];
490  int32 lscr;
492 
495 /*
496  * A note on lm/dict/dict2lm. -ARCHAN 20050616
497  *
498  * In older versions of sphinx3 (<s3.4), dict2lm is a separate object
499  * from lm and dict. A kb actually owns a dict2lm, so the programmer will
500  * read the lm. This separates the initialization of lm and dict2lm, and
501  * it makes a lot of sense if there is **only one** lm and **only one**
502  * dict2lm.
503  *
504  * However, when multiple LMs and switching between them are required,
505  * the problem of the above architecture starts to show up. For
506  * example,
507  * lmset=lm_read_ctl ();
508  * for(i=0;i<kb->n_lm;i++){
509  * dict2lmwid[i]=wid_dict_lm_map
510  * }
511  * At the same time, one will also have an array of lms (lmset[i]) for
512  * corresponding dict2lm[i]!
513  *
514  * Of course, having multiple arrays of things will someday cause
515  * problems.
516  *
517  * The resolution is that we observed that the dict2lm map mostly
518  * changes when the lm needs to change. Also, the
519  * dictionary pronunciation itself seldom changes. That is partially
520  * caused by the fact we don't have too much research on it. So in the
521  * end, that is why it makes sense to let the lm own a dict2lm.
522  *
523  * What if we also allow the dictionary to change? That is a tough
524  * question. In that case, perhaps we should still keep an inventory of sets
525  * of lm and dict2lm and allow lm to store a pointer to dict2lm. Once
526  * there are changes in dict, the programmer will be responsible for updating
527  * dict2lm. (Storing pointers will allow programmers not to update
528  * everything but just the lms corresponding to a particular dict.) I
529  * guess in that case it will be a sign that we need a wrapper that controls
530  * both lm and dict together.
531  */
532 
533 /*
534  * Comments by RKM
535  * To conserve space, bg/tg probs/ptrs kept in many tables. Since the number of
536  * distinct prob values << #bg/#tg, these table indices can be easily fit into
537  * 16 bits. bgprob and bgbowt are such indices. The firsttg entry for a bigram
538  * is harder. It is supposed to be the index of the first trigram entry for each
539  * bigram. But #tg can be >> 2^16. Hence the following segmentation scheme:
540  * Partition bigrams into segments of lm_t.bg_seg_sz consecutive entries, such that
541  * #trigrams in each segment <= 2**16 (the corresponding trigram segment). The
542  * bigram_t.firsttg value is then a 16-bit relative index within the trigram
543  * segment. A separate table--lm_t.tg_segbase--has the absolute index of the
544  * 1st trigram for each segment.
545  */
547 /* Default values for lm_t.log_bg_seg_sz and lm_t.bg_seg_sz:
 * a bigram segment holds 1 << LOG2_BG_SEG_SZ (= 512) consecutive entries. */
548 #define LOG2_BG_SEG_SZ 9
549 #define BG_SEG_SZ (1 << (LOG2_BG_SEG_SZ))
550 #define LM_TGCACHE_SIZE 100003 /* A prime no. (hopefully it IS one!) */
552 /* 20040211 ARCHAN: Yes! Indeed it is a prime */
553 
/**
 * The language model. All unigrams are read into memory on initialization.
 *
 * The structure carries two parallel sets of bigram/trigram members: the
 * 16-bit ones (bg, tg, membg, tginfo, tgcache) and the 32-bit ones
 * (bg32, tg32, membg32, tginfo32, tgcache32).
 * NOTE(review): presumably `is32bits` selects which set is live -- confirm.
 */
559 typedef struct lm_s {
560  char *name ;
561  int32 n_ug; /* see lm_n_ug() */
562  int32 n_bg; /* see lm_n_bg() */
563  int32 n_tg; /* see lm_n_tg() */
564  int32 max_ug;
566  int32 n_ng;
568  char **wordstr; /* word strings indexed by LM word ID; see lm_wordstr() */
571  uint32 log_bg_seg_sz; /* default is LOG2_BG_SEG_SZ */
572  uint32 bg_seg_sz; /* bigram entries per segment; default is BG_SEG_SZ */
573
574  ug_t *ug; /* unigram table indexed by LM word ID (see LM_DICTWID) */
576  /* 20040225 ARCHAN : Data structure to maintain dictionary information */
577  /* Data structure for dictionary to LM words look up mapping */
578  /* 20060306 ARCHAN: Change this to a 32 bits data structure */
579  s3lmwid32_t *dict2lmwid;
580  s3lmwid32_t startlwid; /* see lm_startwid() */
581  s3lmwid32_t finishlwid; /* see lm_finishwid() */
583  bg_t *bg;
584  tg_t *tg;
585  membg_t *membg;
586  tginfo_t **tginfo;
589  lm_tgcache_entry_t *tgcache;
597  /**************************/
598
599
600  bg32_t *bg32;
601  tg32_t *tg32;
602  membg32_t *membg32;
603  tginfo32_t **tginfo32;
605  lm_tgcache_entry32_t *tgcache32;
607  /**************************/
608
609  lmlog_t *bgprob; /* table indexed by a bigram's probid (see LM_BGPROB) */
610  lmlog_t *tgprob; /* table indexed by a trigram's probid (see LM_TGPROB) */
611  lmlog_t *tgbowt;
612  int32 *tg_segbase; /* absolute index of the 1st trigram for each bigram segment */
614  int32 n_bgprob;
615  int32 n_tgprob;
616  int32 n_tgbowt;
617
618  FILE *fp;
619  int32 byteswap;
620  int32 bgoff;
621  int32 tgoff;
623  float32 lw; /* divisor in LM_RAWSCORE; presumably the language weight -- TODO confirm */
624  int32 wip; /* subtracted in LM_RAWSCORE; presumably the word insertion penalty -- TODO confirm */
627  /* Statistics */
628  int32 n_bg_fill;
629  int32 n_bg_inmem;
630  int32 n_bg_score;
631  int32 n_bg_bo;
632  int32 n_tg_fill;
633  int32 n_tg_inmem;
634  int32 n_tg_score;
635  int32 n_tg_bo;
636  int32 n_tgcache_hit;
638  int32 access_type; /* see lm_access_type() */
642  int32 isLM_IN_MEMORY;
645  int32 dict_size;
647  hash_table_t *HT;
650  /* Data structure that maintains the class information */
651  lmclass_t **lmclass; /* indexed by (class ID - LM_CLASSID_BASE); see LM_CLASSID_TO_CLASS */
652  int32 n_lmclass;
653  int32 *inclass_ugscore;
656  int32 inputenc ;
657  int32 outputenc ;
658  int32 version;
661  int32 is32bits;
663  /* Arrays of unique bigram probs and bo-wts, and trigram probs */
664  sorted_list_t sorted_prob2;
665  sorted_list_t sorted_bowt2;
666  sorted_list_t sorted_prob3;
667  int32 max_sorted_entries;
669  logmath_t *logmath;
670 } lm_t;
672 
673 
678 typedef struct lmset_s {
679  lm_t **lmarray;
680  lm_t *cur_lm;
682  int32 cur_lm_idx;
683  int32 n_lm;
684  int32 n_alloc_lm;
686 
/*
 * Field accessors for lm_t. Each macro expands to a plain member access on
 * `lm`; `u` is an LM word ID used as an array index.
 */
688 #define lm_lmwid2dictwid(lm,u) ((lm)->ug[u].dictwid)
689 #define lm_n_ug(lm) ((lm)->n_ug)
690 #define lm_n_bg(lm) ((lm)->n_bg)
691 #define lm_n_tg(lm) ((lm)->n_tg)
692 #define lm_wordstr(lm,u) ((lm)->wordstr[u])
693 #define lm_startwid(lm) ((lm)->startlwid)
694 #define lm_finishwid(lm) ((lm)->finishlwid)
695 #define lm_access_type(lm) ((lm)->access_type)
/**
 * A (word ID, probability) pair. lm_ug_wordprob() takes a pointer to
 * wordprob_t as its `wp` argument.
 */
701 typedef struct {
702  s3wid_t wid; /* word ID (s3wid_t, the type also used for ug_t.dictwid) */
703  int32 prob; /* probability value paired with wid */
704 } wordprob_t;
705 
/*
 * Create an lmset_t. Per the revision history in this file, lmset_init is a
 * wrapper around the various LM initialization routines.
 * NOTE(review): lw/wip/uw are float32 here but float64 in lmset_read_lm and
 * lmset_read_ctl below -- confirm whether that is intentional.
 */
762 lmset_t* lmset_init(const char* lmfile,
763  const char* lmctlfile,
764  const char* ctl_lm,
765  const char* lmname,
766  const char* lmdumpdir,
767  float32 lw,
768  float32 wip,
769  float32 uw,
770  dict_t *dict,
771  logmath_t *logmath
772  );
775 /* It is still a sore point: there are two interfaces for two different
776  types of input, and some of the code is still duplicated. Changing
777  one does not mean the other one will be changed.
778 */
779
/* Variant that takes a single LM file (`lmfile`) and a name for it. */
783 lmset_t* lmset_read_lm(const char *lmfile,
784  dict_t *dict,
785  const char *lmname,
786  float64 lw,
787  float64 wip,
788  float64 uw,
789  const char *lmdumpdir,
790  logmath_t *logmath
791  );
792
/* Variant that takes a control file (`ctlfile`) instead of a single LM file. */
797 lmset_t* lmset_read_ctl(const char * ctlfile,
798  dict_t* dict,
799  float64 lw,
800  float64 wip,
801  float64 uw,
802  const char* lmdumpdir,
803  logmath_t *logmath
804  );
805 
810  int32 lmidx
811  );
812 
818  const char *lmname
819  );
820 
/*
 * Operations on an lmset_t. Per the revision history in this file, lmset_t
 * wraps multiple LMs together with a pointer to the current one (cur_lm).
 * The "_widx" variants take an index (`lmidx`), the "_wname" variants take
 * an LM name (`lmname`).
 */

/* Presumably makes the LM at index `lmidx` the current LM -- TODO confirm. */
824 void lmset_set_curlm_widx(lmset_t *lms,
825  int32 lmidx
826  );
827
/* Presumably makes the LM named `lmname` the current LM -- TODO confirm. */
832 void lmset_set_curlm_wname(lmset_t *lms,
833  const char *lmname
834  );
835
/* Name -> index lookup. NOTE(review): the not-found return value is not visible here. */
839 int32 lmset_name_to_idx(lmset_t *lms,
840  const char *lmname
841  );
842
/* Index -> name lookup. NOTE(review): ownership of the returned string is not visible here. */
848 char* lmset_idx_to_name(lmset_t *lms,
849  int32 lmidx
850  );
851
852
/* Add `lm` to the set under `lmname`. The 1.12 log entry above lists this
 * function among those not yet completed at that check-in. */
857 void lmset_add_lm(lmset_t *lms,
858  lm_t *lm,
859  const char* lmname
860  );
861
/* Delete the LM named `lmname` from the set. */
866 void lmset_delete_lm(lmset_t *lms,
867  const char *lmname
868  );
869
/* Presumably releases the set -- TODO confirm whether the member LMs are freed too. */
874 void lmset_free(lmset_t *lms
875  );
876 
/*
 * N-gram list lookups. Each has a 16-bit form (tg_t / bg_t) and a 32-bit
 * form (tg32_t / bg32_t); both forms take 32-bit LM word IDs.
 * The list pointer and `bowt` are output parameters.
 * NOTE(review): the return value is presumably the number of entries in the
 * list -- confirm in lm.c.
 */

/* Trigram list for the bigram history (w1, w2), 16-bit entries. */
881 int32 lm_tglist (lm_t *lmp,
882  s3lmwid32_t w1,
883  s3lmwid32_t w2,
884  tg_t **tg,
885  int32 *bowt
886  );
887
/* Trigram list for the bigram history (w1, w2), 32-bit entries. */
888 int32 lm_tg32list (lm_t *lmp,
889  s3lmwid32_t w1,
890  s3lmwid32_t w2,
891  tg32_t **tg,
892  int32 *bowt
893  );
894
/* Bigram list for the word w, 16-bit entries. */
899 int32 lm_bglist (lm_t *lmp,
900  s3lmwid32_t w,
901  bg_t **bg,
902  int32 *bowt
903  );
904
/* Bigram list for the word w, 32-bit entries. */
905 int32 lm_bg32list (lm_t *lmp,
906  s3lmwid32_t w,
907  bg32_t **bg,
908  int32 *bowt
909  );
910 
911 
912 #if 0 /* Obsolete, and it would conflict with other code, so it is disabled for now */
913 /*
914  * Somewhat like lm_bglist, but fills up a wordprob_t array from the bigram list found, instead
915  * of simply returning the bglist. The wordprob array contains dictionary word IDs. But note
916  * that only the base IDs are entered; the caller is responsible for filling out the alternative
917  * pronunciations.
918  * Return value: \#entries filled in the wordprob array.
919  */
920 int32 lm_bg_wordprob(lm_t *lm,
921  s3lmwid32_t w,
922  int32 th,
923  wordprob_t *wp,
925  int32 *bowt
926  );
927
928 #endif
929 
930 /* Return LM word ID for the given string, or BAD_LMWID(lm) if not available */
931 s3lmwid32_t lm_wid (lm_t *lm, const char *wd);
932
/* NOTE(review): presumably resets the members of *lm to empty/NULL values -- confirm in lm.c. */
936 void lm_null_struct(lm_t* lm
937  );
938
/*
 * Unigram counterpart of the disabled lm_bg_wordprob: takes a wordprob_t
 * array `wp` and a value `th`.
 * NOTE(review): `th` is presumably a probability threshold and the return
 * value the number of entries filled -- confirm in lm.c.
 */
943 int32 lm_ug_wordprob(lm_t *lm,
944  dict_t *dict,
945  int32 th,
946  wordprob_t *wp
947  );
948
/* Unigram list lookup; `ug` is an output parameter. */
950 int32 lm_uglist (lm_t *lmp,
951  ug_t **ug
952  );
953
954
955
956 /* 20040227: This also accounts for the in-class probability of wid */
958 int32 lm_ug_score (lm_t *lmp,
959  s3lmwid32_t lwid,
960  s3wid_t wid
961  );
962
963
/* NOTE(review): presumably reports whether `lwid` is a valid unigram -- confirm the return convention. */
964 int32 lm_ug_exists(lm_t* lm ,
965  s3lmwid32_t lwid
966  );
967 
968 /*
969  * Return bigram score for the given two word sequence. If w1 is BAD_LMWID(lm), return
970  * lm_ug_score (w2).
971  * 20040227: This also accounts for the in-class probability of w2.
972  */
973 int32 lm_bg_score (lm_t *lmp,
974  s3lmwid32_t lw1,
975  s3lmwid32_t lw2,
976  s3wid_t w2);
977
978
/* NOTE(review): presumably reports whether the bigram (lw1, lw2) is in the LM -- confirm the return convention. */
982 int32 lm_bg_exists (lm_t *lm,
983  s3lmwid32_t lw1,
984  s3lmwid32_t lw2
985  );
986
/*
 * Trigram counterpart of lm_bg_score: takes three LM word IDs plus the
 * dictionary word ID `w3` of the last word.
 * NOTE(review): the back-off behaviour is not visible in this header.
 */
993 int32 lm_tg_score (lm_t *lmp,
994  s3lmwid32_t lw1,
995  s3lmwid32_t lw2,
996  s3lmwid32_t lw3,
997  s3wid_t w3);
998
999
/* NOTE(review): presumably reports whether the trigram (lw1, lw2, lw3) is in the LM -- confirm the return convention. */
1003 int32 lm_tg_exists (lm_t *lm,
1004  s3lmwid32_t lw1,
1005  s3lmwid32_t lw2,
1006  s3lmwid32_t lw3
1007  );
1008 
/*
 * Set parameters `lw` and `wip` on the LM (lm_t has members of the same
 * names).
 * NOTE(review): whether existing scores are re-weighted is not visible here.
 */
1016 void lm_set_param (lm_t *lm,
1017  float64 lw,
1018  float64 wip
1019  );
1020
1021
/*
 * Function form of raw-score recovery; the revision history notes that
 * lm_rawscore does not need a language weight argument. See also the
 * LM_RAWSCORE macro in this header.
 */
1023 int32 lm_rawscore (lm_t *lm,
1024  int32 score
1025  );
1026
1027
1028
/* NOTE(review): presumably clears the LM's internal caches (tgcache/tginfo) -- confirm in lm.c. */
1031 void lm_cache_reset (lm_t *lmp
1032  );
1033
/* NOTE(review): presumably reports the statistics counters kept in lm_t (n_*_fill, n_*_score, ...) -- confirm. */
1036 void lm_cache_stats_dump (lm_t *lmp
1037  );
1038 
/*
 * Read an LM from `file`. Per the revision history, lm_read is a simple
 * version of lm_read_advance; here the settings come from `config`.
 */
1048 lm_t * lm_read (
1049  const char *file,
1050  const char *lmname,
1051  cmd_ln_t *config,
1052  logmath_t *logmath);
1053
/*
 * Read an LM with every parameter passed explicitly.
 * NOTE(review): `fmt` presumably selects the file format and `applyweight`
 * whether lw/wip are applied while reading -- confirm in lm.c.
 */
1094 lm_t *lm_read_advance (const char *file,
1095  const char *lmname,
1096  float64 lw,
1097  float64 wip,
1098  float64 uw,
1099  int32 ndict,
1102  const char* fmt,
1106  int32 applyweight,
1108  logmath_t *logmath
1109  );
1110
/* Same parameter list as lm_read_advance plus `lminmemory`. */
1112 lm_t *lm_read_advance2(const char *file,
1113  const char *lmname,
1114  float64 lw,
1115  float64 wip,
1116  float64 uw,
1117  int32 ndict,
1120  const char* fmt,
1124  int32 applyweight,
1126  int lminmemory,
1127  logmath_t *logmath
1128  );
/*
 * Write `model` to `outputfile` in the format named by `fmt`.
 * NOTE(review): the role of `filename` (as opposed to `outputfile`) is not
 * visible in this header -- confirm in lm.c.
 */
1135 int32 lm_write(lm_t *model,
1136  const char *outputfile,
1137  const char *filename,
1138  const char *fmt
1139  );
1140
/*
 * Same as lm_write plus explicit input and output encodings.
 * NOTE(review): `outputenc` is a non-const char* while `inputenc` is const;
 * confirm whether the function really writes through it.
 */
1172 int32 lm_write_advance(lm_t *model,
1173  const char *outputfile,
1174  const char *filename,
1175  const char *fmt,
1176  const char* inputenc,
1177  char* outputenc
1178  );
1179
1180 /* RAH, added code for freeing allocated memory
1181  */
1186 void lm_free (lm_t *lm
1187  );
1188 
/* NOTE(review): presumably adds every word listed in the file `filename` to the LM -- confirm in lm.c. */
1202 int32 lm_add_wordlist(lm_t *lm,
1203  dict_t *dict,
1206  const char* filename
1209  );
1210
/* NOTE(review): presumably adds the single word `newword` to the unigram table -- confirm in lm.c. */
1223 int32 lm_add_word_to_ug(lm_t *lm,
1224  dict_t *dict,
1227  const char* newword
1228  );
/* Look up a class ID by class name. NOTE(review): the not-found return value is not visible here. */
1232 int32 lm_get_classid (lm_t *model,
1233  const char *name
1234  );
1235
/* NOTE(review): presumably converts between the 16-bit and 32-bit in-memory
 * structures, with `is32bits` as the target -- confirm. */
1239 void lm_convert_structure(lm_t *model,
1240  int32 is32bits
1241  );
1242
/* NOTE(review): presumably reports lm_t.is32bits -- confirm. */
1246 int32 lm_is32bits(lm_t* model);
1247 
/*
 * Writers for single n-gram entries (the "*g_write" routines mentioned in
 * revision 1.16). Each takes an open FILE* and a pointer to one entry.
 * NOTE(review): the output format is not visible in this header.
 */
1251 void ug_write(FILE* fp,
1252  ug_t* ug
1253  );
1257 void bg_write(FILE* fp,
1258  bg_t* bg
1259  );
1260
1264 void bg32_write(FILE* fp,
1265  bg32_t* bg
1266  );
1267
1272 void tg_write(FILE* fp,
1273  tg_t* tg
1274  );
1275
1280 void tg32_write(FILE* fp,
1281  tg32_t* tg
1282  );
1283
1284
/*
 * Converters between the 16-bit and 32-bit bigram/trigram arrays of `lm`.
 * NOTE(review): presumably each copies lm's source array into its
 * counterpart (e.g. lm->bg into lm->bg32) -- confirm allocation and
 * ownership in lm.c.
 */
1288 void copy_bg_to_bg32(lm_t *lm
1289  );
1290
1295 void copy_bg32_to_bg(lm_t *lm
1296  );
1297
1301 void copy_tg_to_tg32(lm_t *lm
1302  );
1303
1308 void copy_tg32_to_tg(lm_t *lm
1309  );
1310
/* NOTE(review): presumably byte-swap one entry in place (cf. lm_t.byteswap) -- confirm. */
1314 void swap_bg(bg_t* bg);
1315
1316
1320 void swap_bg32(bg32_t* bg);
1321
1325 void swap_tg(tg_t* tg);
1326
1327
1331 void swap_tg32(tg32_t* tg);
1332 
/*
 * Search an array of `n` bigram/trigram entries for LM word ID `w`.
 * NOTE(review): presumably returns the index of the matching entry, with a
 * negative value when absent -- confirm the return convention and any
 * ordering requirement on the array in lm.c.
 */
1333 int32 find_bg (bg_t *bg,
1334  int32 n,
1335  s3lmwid32_t w
1336  );
1337
1338 int32 find_bg32 (bg32_t *bg,
1339  int32 n,
1340  s3lmwid32_t w
1341  );
1342
1343
1344 int32 find_tg (tg_t *tg,
1345  int32 n, s3lmwid32_t w);
1346
1347 int32 find_tg32 (tg32_t *tg,
1348  int32 n, s3lmwid32_t w);
1349 
/*
 * Macro versions of access functions.
 *
 * LM_TGPROB / LM_BGPROB: integer-format probability of a trigram / bigram
 *     entry, fetched from lm->tgprob / lm->bgprob through the entry's probid.
 * LM_UGPROB: integer-format probability stored in a unigram entry (the lm
 *     argument is accepted for symmetry and not used).
 * LM_RAWSCORE: (score - lm->wip) / lm->lw. Every macro argument is
 *     parenthesized so that expression arguments such as `a << b` or
 *     `c ? x : y` are evaluated as a unit before the subtraction.
 *     The division uses lm->lw as-is; callers must ensure it is non-zero.
 * LM_DICTWID: dictionary word ID of the given LM word ID.
 */
#define LM_TGPROB(lm,tgptr) ((lm)->tgprob[(tgptr)->probid].l)
#define LM_BGPROB(lm,bgptr) ((lm)->bgprob[(bgptr)->probid].l)
#define LM_UGPROB(lm,ugptr) ((ugptr)->prob.l)
#define LM_RAWSCORE(lm,score) (((score) - (lm)->wip) / ((lm)->lw))
#define LM_DICTWID(lm,lmwid) ((lm)->ug[(lmwid)].dictwid)
1356 
/*
 * Create a unigram table for `n_ug` unigrams and return a pointer to it.
 * NOTE(review): initial contents and who frees the table are not visible in
 * this header -- confirm in lm.c.
 */
1360 ug_t *NewUnigramTable (int32 n_ug
1361  );
1362 
1363 
1364 #if 0
1365 { /* Stop indent from complaining */
1366 #endif
1367 #ifdef __cplusplus
1368 }
1369 #endif
1370 
1371 #endif
lm_t * lmset_get_lm_wname(lmset_t *lms, const char *lmname)
int32 lm_bglist(lm_t *lmp, s3lmwid32_t w, bg_t **bg, int32 *bowt)
lm_t * lmset_get_lm_widx(lmset_t *lms, int32 lmidx)
Definition: lm.h:641
S3DECODER_EXPORT lmset_t * lmset_init(const char *lmfile, const char *lmctlfile, const char *ctl_lm, const char *lmname, const char *lmdumpdir, float32 lw, float32 wip, float32 uw, dict_t *dict, logmath_t *logmath)
A unigram structure Please see.
Definition: lm.h:446
struct sorted_entry_s sorted_entry_t
struct lmset_s lmset_t
S3DECODER_EXPORT void lm_cache_reset(lm_t *lmp)
A bigram structure.
Definition: lm.h:460
int32 lm_tg_exists(lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3)
uint16 s3lmwid_t
Definition: s3types.h:142
void swap_bg(bg_t *bg)
Generic structure that could be used at any n-gram level.
Definition: lm.h:783
int32 s3wid_t
Definition: s3types.h:136
int32 lm_is32bits(lm_t *model)
lmset_t * lmset_read_ctl(const char *ctlfile, dict_t *dict, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath)
Structure for multiple LM, provide operations for addition/deletion/read Structure for multiple...
void copy_tg32_to_tg(lm_t *lm)
void ug_write(FILE *fp, ug_t *ug)
int32 find_tg(tg_t *tg, int32 n, s3lmwid32_t w)
struct lm_s lm_t
void lm_null_struct(lm_t *lm)
void tg32_write(FILE *fp, tg32_t *tg)
S3DECODER_EXPORT int32 lm_write(lm_t *model, const char *outputfile, const char *filename, const char *fmt)
S3DECODER_EXPORT void lmset_free(lmset_t *lms)
S3DECODER_EXPORT void lm_cache_stats_dump(lm_t *lmp)
int32 lmset_name_to_idx(lmset_t *lms, const char *lmname)
An LM class object.
S3DECODER_EXPORT void lm_free(lm_t *lm)
void bg_write(FILE *fp, bg_t *bg)
Operations on dictionary.
void lmset_set_curlm_widx(lmset_t *lms, int32 lmidx)
S3DECODER_EXPORT void lmset_set_curlm_wname(lmset_t *lms, const char *lmname)
void swap_tg(tg_t *tg)
void lmset_add_lm(lmset_t *lms, lm_t *lm, const char *lmname)
void swap_tg32(tg32_t *tg)
int32 lm_ug_score(lm_t *lmp, s3lmwid32_t lwid, s3wid_t wid)
ug_t * NewUnigramTable(int32 n_ug)
char * lmset_idx_to_name(lmset_t *lms, int32 lmidx)
Definition: lm.h:530
int32 lm_ug_exists(lm_t *lm, s3lmwid32_t lwid)
A trigram structure.
Definition: lm.h:483
Definition: lm.h:545
void swap_bg32(bg32_t *bg)
void bg32_write(FILE *fp, bg32_t *bg)
Size definition of semantically units. Common for both s3 and s3.X decoder.
int32 find_bg32(bg32_t *bg, int32 n, s3lmwid32_t w)
The sorted list used lm reading. list is a (64K long) array. The first entry is the root of the tree ...
Definition: lm.h:437
#define S3DECODER_EXPORT
Definition: sphinx3_export.h:15
void copy_bg32_to_bg(lm_t *lm)
int32 find_bg(bg_t *bg, int32 n, s3lmwid32_t w)
struct tginfo_s * next
Definition: lm.h:537
single entry used in the linked list structure of lm reading
uint32 lower
Definition: lm.h:425
int32 lm_bg_score(lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3wid_t w2)
void lmset_delete_lm(lmset_t *lms, const char *lmname)
a structure for a dictionary.
Definition: dict.h:146
int32 lm_tg_score(lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3, s3wid_t w3)
trigram cache that enhance locating trigram for a given bigram (w_1,w_2)
int32 lm_get_classid(lm_t *model, const char *name)
32 bit version of tginfo
int32 lm_bg32list(lm_t *lmp, s3lmwid32_t w, bg32_t **bg, int32 *bowt)
struct tginfo32_s tginfo32_t
Log quantities represented in either floating or integer format.
Definition: lm.h:412
void copy_bg_to_bg32(lm_t *lm)
S3DECODER_EXPORT lm_t * lm_read_advance2(const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, int lminmemory, logmath_t *logmath)
int32 lm_write_advance(lm_t *model, const char *outputfile, const char *filename, const char *fmt, const char *inputenc, char *outputenc)
void copy_tg_to_tg32(lm_t *lm)
int32 lm_tglist(lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg_t **tg, int32 *bowt)
s3lmwid32_t lm_wid(lm_t *lm, const char *wd)
A 32 bits version of tg_t.
Definition: lm.h:493
int32 lm_bg_exists(lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2)
int32 lm_ug_wordprob(lm_t *lm, dict_t *dict, int32 th, wordprob_t *wp)
int32 lm_tg32list(lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg32_t **tg, int32 *bowt)
Definition: lm.h:560
Management of in-memory bigrams. Not used if all bigrams in memory.
Definition: lm.h:502
The language model. All unigrams are read into memory on initialization. Bigrams and trigrams read in...
void lm_convert_structure(lm_t *model, int32 is32bits)
Definition: lm.h:423
struct tginfo32_s * next
Definition: lm.h:552
struct tginfo_s tginfo_t
uint32 s3lmwid32_t
Definition: s3types.h:149
int32 lm_add_word_to_ug(lm_t *lm, dict_t *dict, const char *newword)
int32 lm_uglist(lm_t *lmp, ug_t **ug)
lm_t * lm_read(const char *file, const char *lmname, cmd_ln_t *config, logmath_t *logmath)
Definition: lm.h:570
A bigram structure which has 32 bits.
Definition: lm.h:471
int32 lm_add_wordlist(lm_t *lm, dict_t *dict, const char *filename)
Language model class modules. This module maintains classes of words and associated probabilities (P(...
void tg_write(FILE *fp, tg_t *tg)
#define dict_size(d)
Definition: dict.h:225
Definition: lm.h:760
void lm_set_param(lm_t *lm, float64 lw, float64 wip)
S3DECODER_EXPORT int32 lm_rawscore(lm_t *lm, int32 score)
lm_t * lm_read_advance(const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, logmath_t *logmath)
lmset_t * lmset_read_lm(const char *lmfile, dict_t *dict, const char *lmname, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath)
int32 find_tg32(tg32_t *tg, int32 n, s3lmwid32_t w)
A 32 bits version of membg_t.
Definition: lm.h:512
lmlog_t val
Definition: lm.h:424
uint32 higher
Definition: lm.h:428