[irstlm] 25/126: added test for context dependent lm
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:41 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit 62da8357811e4d4783b6fa0d3d5865d9b221ef12
Author: Marcello Federico <mrcfdr at gmail.com>
Date: Thu Jul 23 15:54:17 2015 +0200
added test for context dependent lm
---
src/test-cdlm.cpp | 526 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 526 insertions(+)
diff --git a/src/test-cdlm.cpp b/src/test-cdlm.cpp
new file mode 100644
index 0000000..f3c534b
--- /dev/null
+++ b/src/test-cdlm.cpp
@@ -0,0 +1,526 @@
+// $Id: compile-lm.cpp 3677 2010-10-13 09:06:51Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit, compile LM
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
+
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+#include "cmd.h"
+#include "util.h"
+#include "math.h"
+#include "lmContainer.h"
+
+using namespace std;
+using namespace irstlm;
+
+/********************************/
+// Writes the tool banner, the usage line and the description to stderr, then
+// hands TypeFlag to FullPrintParams (presumably declared in cmd.h - confirm)
+// which is expected to list the options registered with DeclareParams.
+void print_help(int TypeFlag=0){
+  std::cerr << std::endl << "compile-lm - compiles an ARPA format LM into an IRSTLM format one" << std::endl
+            << std::endl << "USAGE:" << std::endl
+            << " compile-lm [options] <input-file.lm> [output-file.blm]" << std::endl
+            << std::endl << "DESCRIPTION:" << std::endl
+            << " compile-lm reads a standard LM file in ARPA format and produces" << std::endl
+            << " a compiled representation that the IRST LM toolkit can quickly" << std::endl
+            << " read and process. LM file can be compressed." << std::endl
+            << std::endl << "OPTIONS:" << std::endl;
+
+  FullPrintParams(TypeFlag, 0, 1, stderr);
+}
+
+// Prints msg on stderr when one is supplied; with no message (the default)
+// it prints the full help text instead.
+void usage(const char *msg = 0)
+{
+  if (msg == 0) {
+    print_help();
+    return;
+  }
+  std::cerr << msg << std::endl;
+}
+
+// Entry point. Parses the command line, loads the language model named by the
+// first positional argument (optionally filtering it with a word list) and
+// then runs exactly one of the following modes:
+//   --eval with --randcalls>0 : scores randomly drawn words of the eval file
+//   --eval                    : prints perplexity statistics for the eval file
+//   --score                   : scores n-grams read from standard input
+//   --ngramscore              : scores the n-gram preceding each _END_NGRAM_
+//   none of the above         : saves the model to the output file
+// Every mode ends with "return 0"; argument errors are reported through
+// exit_error().
+int main(int argc, char **argv)
+{
+  char *seval=NULL;
+  char *tmpdir=NULL;
+  char *sfilter=NULL;
+
+  bool textoutput = false;
+  bool sent_PP_flag = false;
+  bool invert = false;
+  bool sscore = false;
+  bool ngramscore = false;
+  bool skeepunigrams = false;
+
+  int debug = 0;
+  bool memmap = false;
+  int requiredMaxlev = 1000;
+  int dub = 10000000;
+  int randcalls = 0;
+  float ngramcache_load_factor = 0.0;
+  float dictionary_load_factor = 0.0;
+
+  bool help=false;
+  std::vector<std::string> files;
+
+  // Each option is registered twice: once with its long name, once with its short alias.
+  // NOTE(review): the "keepunigrams" help says "default is true" but
+  // skeepunigrams is initialised to false above - confirm which one is intended.
+  DeclareParams((char*)
+                "text", CMDBOOLTYPE|CMDMSG, &textoutput, "output is again in text format; default is false",
+                "t", CMDBOOLTYPE|CMDMSG, &textoutput, "output is again in text format; default is false",
+                "filter", CMDSTRINGTYPE|CMDMSG, &sfilter, "filter a binary language model with a word list",
+                "f", CMDSTRINGTYPE|CMDMSG, &sfilter, "filter a binary language model with a word list",
+                "keepunigrams", CMDBOOLTYPE|CMDMSG, &skeepunigrams, "filter by keeping all unigrams in the table, default is true",
+                "ku", CMDBOOLTYPE|CMDMSG, &skeepunigrams, "filter by keeping all unigrams in the table, default is true",
+                "eval", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file",
+                "e", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file",
+                "randcalls", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
+                "r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
+                "score", CMDBOOLTYPE|CMDMSG, &sscore, "computes log-prob scores of n-grams from standard input",
+                "s", CMDBOOLTYPE|CMDMSG, &sscore, "computes log-prob scores of n-grams from standard input",
+                "ngramscore", CMDBOOLTYPE|CMDMSG, &ngramscore, "computes log-prob scores of the last n-gram before an _END_NGRAM_ symbol from standard input",
+                "ns", CMDBOOLTYPE|CMDMSG, &ngramscore, "computes log-prob scores of the last n-gram before an _END_NGRAM_ symbol from standard input",
+                "debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
+                "d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
+                "level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
+                "l", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
+                "memmap", CMDBOOLTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM",
+                "mm", CMDBOOLTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM",
+                "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7",
+                "tmpdir", CMDSTRINGTYPE|CMDMSG, &tmpdir, "directory for temporary computation, default is either the environment variable TMP if defined or \"/tmp\")",
+                "invert", CMDBOOLTYPE|CMDMSG, &invert, "builds an inverted n-gram binary table for fast access; default if false",
+                "i", CMDBOOLTYPE|CMDMSG, &invert, "builds an inverted n-gram binary table for fast access; default if false",
+                "sentence", CMDBOOLTYPE|CMDMSG, &sent_PP_flag, "computes perplexity at sentence level (identified through the end symbol)",
+                "dict_load_factor", CMDFLOATTYPE|CMDMSG, &dictionary_load_factor, "sets the load factor for the dictionary; it should be a positive real value; default is 0",
+                "ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is 0",
+
+                "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
+                "h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
+
+                (char*)NULL
+               );
+
+  if (argc == 1){
+    usage();
+    exit_error(IRSTLM_NO_ERROR);
+  }
+
+  // every argument that does not start with '-' is a positional file name
+  for(int i=1; i < argc; i++) {
+    if(argv[i][0] != '-'){
+      files.push_back(argv[i]);
+    }
+  }
+
+
+  GetParams(&argc, &argv, (char*) NULL);
+
+  if (help){
+    usage();
+    exit_error(IRSTLM_NO_ERROR);
+  }
+
+  if (files.size() > 2) {
+    usage();
+    exit_error(IRSTLM_ERROR_DATA,"Warning: Too many arguments");
+  }
+
+  if (files.size() < 1) {
+    usage();
+    exit_error(IRSTLM_ERROR_DATA,"Warning: Please specify a LM file to read from");
+  }
+
+  std::string infile = files[0];
+  std::string outfile = "";
+
+  if (files.size() == 1) {
+    // no explicit output name: derive it from the input name
+    outfile=infile;
+
+    //remove path information
+    std::string::size_type p = outfile.rfind('/');
+    if (p != std::string::npos && ((p+1) < outfile.size()))
+      outfile.erase(0,p+1);
+
+    //eventually strip .gz
+    //(the size check keeps size()-3 from wrapping around for names shorter
+    // than three characters, which made compare() throw std::out_of_range)
+    if (outfile.size() >= 3 && outfile.compare(outfile.size()-3,3,".gz")==0)
+      outfile.erase(outfile.size()-3,3);
+
+    outfile+=(textoutput?".lm":".blm");
+  } else{
+    outfile = files[1];
+  }
+
+  std::cerr << "inpfile: " << infile << std::endl;
+  std::cerr << "outfile: " << outfile << std::endl;
+  if (seval!=NULL) std::cerr << "evalfile: " << seval << std::endl;
+  if (sscore==true) std::cerr << "interactive: " << sscore << std::endl;
+  if (ngramscore==true) std::cerr << "interactive for ngrams only: " << ngramscore << std::endl;
+  if (memmap) std::cerr << "memory mapping: " << memmap << std::endl;
+  std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl;
+  std::cerr << "dub: " << dub<< std::endl;
+  if (tmpdir != NULL) {
+    if (setenv("TMP",tmpdir,1))
+      std::cerr << "temporary directory has not been set" << std::endl;
+    std::cerr << "tmpdir: " << tmpdir << std::endl;
+  }
+
+
+  //checking the language model type
+  lmContainer* lmt = lmContainer::CreateLanguageModel(infile,ngramcache_load_factor,dictionary_load_factor);
+
+  //let know that table has inverted n-grams
+  if (invert) lmt->is_inverted(invert);
+
+  lmt->setMaxLoadedLevel(requiredMaxlev);
+
+  lmt->load(infile);
+
+  //CHECK this part for sfilter to make it possible only for LMTABLE
+  if (sfilter != NULL) {
+    lmContainer* filtered_lmt = NULL;
+    std::cerr << "BEFORE sublmC (" << (void*) filtered_lmt << ") (" << (void*) &filtered_lmt << ")\n";
+
+    // the function filter performs the filtering and returns true, only for specific lm type
+    if (((lmContainer*) lmt)->filter(sfilter,filtered_lmt,skeepunigrams?"yes":"no")) {
+      std::cerr << "BFR filtered_lmt (" << (void*) filtered_lmt << ") (" << (void*) &filtered_lmt << ")\n";
+      filtered_lmt->stat();
+      // the filtered model replaces the one loaded from file
+      delete lmt;
+      lmt=filtered_lmt;
+      std::cerr << "AFTER filtered_lmt (" << (void*) filtered_lmt << ")\n";
+      filtered_lmt->stat();
+      std::cerr << "AFTER lmt (" << (void*) lmt << ")\n";
+      lmt->stat();
+    }
+  }
+
+  if (dub) lmt->setlogOOVpenalty((int)dub);
+
+  //use caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags)
+  lmt->init_caches(lmt->maxlevel());
+
+  if (seval != NULL) {
+    if (randcalls>0) {
+
+      cerr << "perform random " << randcalls << " using dictionary of test set\n";
+      dictionary *dict;
+      dict=new dictionary(seval);
+
+      //build extensive histogram: word code n is stored freq(n) times, so a
+      //uniform draw from it follows the word frequencies of the eval file.
+      //A std::vector is used instead of a variable-length array, which is
+      //not standard C++ and lives on the stack.
+      std::vector<int> histo;
+
+      for (int n=0; n<dict->size(); n++)
+        for (int m=0; m<dict->freq(n); m++)
+          histo.push_back(n);
+
+      const int totfreq=(int)histo.size(); //total frequency
+
+      if (totfreq==0) {
+        //nothing to sample from: rand() % 0 below would be undefined
+        delete dict;
+        delete lmt;
+        exit_error(IRSTLM_ERROR_DATA,"Error: no words found in the evaluation file");
+        return 1; //only reached if exit_error returns
+      }
+
+      ngram ng(lmt->getDict());
+      srand(1234); //fixed seed: the sequence of random calls is reproducible
+      double bow;
+      int bol=0;
+
+      if (debug>1) ResetUserTime();
+
+      for (int n=0; n<randcalls; n++) {
+        //extracts a random word from dict
+        int w=histo[rand() % totfreq];
+
+        ng.pushc(lmt->getDict()->encode(dict->decode(w)));
+
+        lmt->clprob(ng,&bow,&bol); //(using caches if available)
+
+        if (debug==1) {
+          std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << lmt->maxlevel()-bol << "]" << " ";
+          std::cout << std::endl;
+          std::cout.flush();
+        }
+
+        if ((n % 100000)==0) {
+          std::cerr << ".";
+          lmt->check_caches_levels();
+        }
+      }
+      std::cerr << "\n";
+      if (debug>1) PrintUserTime("Finished in");
+      if (debug>1) lmt->stat();
+
+      delete dict; //was leaked before
+      delete lmt;
+      return 0;
+
+    } else {
+      // these LM types get their debug level capped at 4
+      if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+        debug = (debug>4)?4:debug;
+        std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+      }
+      if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+        debug = (debug>4)?4:debug;
+        std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+      }
+      if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+        debug = (debug>4)?4:debug;
+        std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+      }
+      std::cerr << "Start Eval" << std::endl;
+      std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+      ngram ng(lmt->getDict());
+      std::cout.setf(ios::fixed);
+      std::cout.precision(2);
+
+      // if (debug>0) std::cout.precision(8);
+      std::fstream inptxt(seval,std::ios::in);
+
+      // counters over the whole file: backed-off n-grams, words, OOV words
+      int Nbo=0, Nw=0,Noov=0;
+      double logPr=0,PP=0,PPwp=0,Pr;
+
+      // variables for storing sentence-based Perplexity
+      int sent_Nbo=0, sent_Nw=0,sent_Noov=0;
+      double sent_logPr=0,sent_PP=0,sent_PPwp=0;
+
+
+      ng.dict->incflag(1);
+      int bos=ng.dict->encode(ng.dict->BoS());
+      int eos=ng.dict->encode(ng.dict->EoS());
+      ng.dict->incflag(0);
+
+      double bow;
+      int bol=0;
+      char *msp;
+      unsigned int statesize;
+
+      lmt->dictionary_incflag(1);
+
+      while(inptxt >> ng) {
+
+        if (ng.size>lmt->maxlevel()) ng.size=lmt->maxlevel();
+
+        // reset ngram at begin of sentence
+        if (*ng.wordp(1)==bos) {
+          ng.size=1;
+          continue;
+        }
+
+        if (ng.size>=1) {
+          Pr=lmt->clprob(ng,&bow,&bol,&msp,&statesize);
+          logPr+=Pr;
+          sent_logPr+=Pr;
+
+          if (debug==1) {
+            std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << ng.size-bol << "]" << " ";
+            if (*ng.wordp(1)==eos) std::cout << std::endl;
+          }
+          else if (debug==2) {
+            std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr;
+            std::cout << std::endl;
+            std::cout.flush();
+          }
+          else if (debug==3) {
+            std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr << " bow:" << bow;
+            std::cout << std::endl;
+            std::cout.flush();
+          }
+          else if (debug==4) {
+            std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow;
+            std::cout << std::endl;
+            std::cout.flush();
+          }
+          else if (debug>4) {
+            std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow;
+            // sanity check: with the OOV penalty set to 0, the probabilities of
+            // all dictionary words in this context are summed and compared with 1
+            double totp=0.0;
+            int oldw=*ng.wordp(1);
+            double oovp=lmt->getlogOOVpenalty();
+            lmt->setlogOOVpenalty((double) 0);
+            for (int c=0; c<ng.dict->size(); c++) {
+              *ng.wordp(1)=c;
+              totp+=pow(10.0,lmt->clprob(ng)); //using caches if available
+            }
+            *ng.wordp(1)=oldw;
+
+            if ( totp < (1.0 - 1e-5) || totp > (1.0 + 1e-5))
+              std::cout << " [t=" << totp << "] POSSIBLE ERROR";
+            std::cout << std::endl;
+            std::cout.flush();
+
+            lmt->setlogOOVpenalty((double)oovp);
+          }
+
+
+          if (lmt->is_OOV(*ng.wordp(1))) {
+            Noov++;
+            sent_Noov++;
+          }
+          if (bol) {
+            Nbo++;
+            sent_Nbo++;
+          }
+          Nw++;
+          sent_Nw++;
+          if (sent_PP_flag && (*ng.wordp(1)==eos)) {
+            sent_PP=exp((-sent_logPr * log(10.0)) /sent_Nw);
+            sent_PPwp= sent_PP * (1 - 1/exp((sent_Noov * lmt->getlogOOVpenalty()) * log(10.0) / sent_Nw));
+
+            std::cout << "%% sent_Nw=" << sent_Nw
+                      << " sent_PP=" << sent_PP
+                      << " sent_PPwp=" << sent_PPwp
+                      << " sent_Nbo=" << sent_Nbo
+                      << " sent_Noov=" << sent_Noov
+                      << " sent_OOV=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+            std::cout.flush();
+            //reset statistics for sentence based Perplexity
+            sent_Nw=sent_Noov=sent_Nbo=0;
+            sent_logPr=0.0;
+          }
+
+          if ((Nw % 100000)==0) {
+            std::cerr << ".";
+            lmt->check_caches_levels();
+          }
+
+        }
+      }
+
+      // clprob values are accumulated as base-10 logs (see the pow(10.0,...)
+      // check above), hence the log(10.0) factor before exp()
+      PP=exp((-logPr * log(10.0)) /Nw);
+
+      PPwp= PP * (1 - 1/exp((Noov * lmt->getlogOOVpenalty()) * log(10.0) / Nw));
+
+      std::cout << "%% Nw=" << Nw
+                << " PP=" << PP
+                << " PPwp=" << PPwp
+                << " Nbo=" << Nbo
+                << " Noov=" << Noov
+                << " OOV=" << (float)Noov/Nw * 100.0 << "%";
+      if (debug) std::cout << " logPr=" << logPr;
+      std::cout << std::endl;
+      std::cout.flush();
+
+      if (debug>1) lmt->used_caches();
+
+      if (debug>1) lmt->stat();
+
+      delete lmt;
+      return 0;
+    }
+  }
+
+  if (sscore == true) {
+
+    ngram ng(lmt->getDict());
+    int bos=ng.dict->encode(ng.dict->BoS());
+
+    int bol;
+    double bow;
+    unsigned int n=0;
+
+    // NOTE(review): the two setf calls leave both floatfield bits set; since
+    // C++11 that combination is specified as hexfloat formatting - confirm
+    // whether plain fixed notation was intended here.
+    std::cout.setf(ios::scientific);
+    std::cout.setf(ios::fixed);
+    std::cout.precision(2);
+    std::cout << "> ";
+
+    lmt->dictionary_incflag(1);
+
+    while(std::cin >> ng) {
+
+      //std::cout << ng << std::endl;;
+      // reset ngram at begin of sentence
+      if (*ng.wordp(1)==bos) {
+        ng.size=1;
+        continue;
+      }
+
+      // only n-grams that reach the LM order are scored
+      if (ng.size>=lmt->maxlevel()) {
+        ng.size=lmt->maxlevel();
+        ++n;
+        if ((n % 100000)==0) {
+          std::cerr << ".";
+          lmt->check_caches_levels();
+        }
+        std::cout << ng << " p= " << lmt->clprob(ng,&bow,&bol) * M_LN10;
+        std::cout << " bo= " << bol << std::endl;
+      } else {
+        std::cout << ng << " p= NULL" << std::endl;
+      }
+      std::cout << "> ";
+    }
+    std::cout << std::endl;
+    std::cout.flush();
+    if (debug>1) lmt->used_caches();
+
+    if (debug>1) lmt->stat();
+
+    delete lmt;
+    return 0;
+  }
+
+
+  if (ngramscore == true) {
+
+    const char* _END_NGRAM_="_END_NGRAM_";
+    ngram ng(lmt->getDict());
+
+    double Pr;
+    double bow;
+    int bol=0;
+    char *msp;
+    unsigned int statesize;
+
+    std::cout.setf(ios::fixed);
+    std::cout.precision(2);
+
+    ng.dict->incflag(1);
+    int endngram=ng.dict->encode(_END_NGRAM_);
+    ng.dict->incflag(0);
+
+    while(std::cin >> ng) {
+      // compute score for the last ngram when endngram symbols is found
+      // and reset ngram
+      if (*ng.wordp(1)==endngram) {
+        ng.shift();
+        if (ng.size>=lmt->maxlevel()) {
+          ng.size=lmt->maxlevel();
+        }
+
+        Pr=lmt->clprob(ng,&bow,&bol,&msp,&statesize);
+#ifndef OUTPUT_SUPPRESSED
+        std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow;
+        std::cout << std::endl;
+        std::cout.flush();
+#endif
+        ng.size=0;
+      }
+    }
+
+    if (debug>1) lmt->used_caches();
+
+    if (debug>1) lmt->stat();
+
+    delete lmt;
+    return 0;
+  }
+
+  // no scoring mode requested: write the model out
+  if (textoutput == true) {
+    std::cerr << "Saving in txt format to " << outfile << std::endl;
+    lmt->savetxt(outfile.c_str());
+  } else if (!memmap) {
+    std::cerr << "Saving in bin format to " << outfile << std::endl;
+    lmt->savebin(outfile.c_str());
+  } else {
+    std::cerr << "Impossible to save to " << outfile << std::endl;
+  }
+  delete lmt;
+  return 0;
+}
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list