You can use the formatting commands describes in TextFormattingRules in your comment.
If you want to post some code, surround it with
<verbatim>and</verbatim>tags.
Auto-linking of WikiWords is now disabled in comments, so you can type VectorFst and it won't result in a broken link.
You now need to use
<br>to force new lines in your comment (unless insideverbatimtags). However, a blank line will automatically create a new paragraph.
potentials.txt input is the ouptut of fstshortestdistance.
fstshortestdistance a.fst > potentials.txt fstreweight --to_initial a.fst potentials.txt > b.fstThis is equivalent to:
fstpush --push_weights --to_initial a.fst > b.fstThis is actually the way pushing is implemented in the library, shortest-distance followed by reweighting.
Ld build/Debug/fstprintallpaths normal x86_64
cd /.../fstprintallpaths
setenv MACOSX_DEPLOYMENT_TARGET 10.6
/Developer/usr/bin/g++-4.2 -arch x86_64 -isysroot /Developer/SDKs/MacOSX10.6.sdk -L/Users/michael/code/fstprintallpaths/build/Debug -L/usr/local/lib -L/usr/local/lib/Cabal-1.6.0.2 -L/usr/local/lib/ImageMagick-6.4.1 -L/usr/local/lib/pkgconfig -L/usr/local/lib/Cabal-1.6.0.2/ghc-6.10.4 -L/usr/local/lib/ImageMagick-6.4.1/config -L/usr/local/lib/ImageMagick-6.4.1/modules-Q16 -L/usr/local/lib/Cabal-1.6.0.2/ghc-6.10.4/Distribution -L/usr/local/lib/Cabal-1.6.0.2/ghc-6.10.4/Language -L/usr/local/lib/ImageMagick-6.4.1/modules-Q16/coders -L/usr/local/lib/ImageMagick-6.4.1/modules-Q16/filters -L/usr/local/lib/Cabal-1.6.0.2/ghc-6.10.4/Distribution/Compat -L/usr/local/lib/Cabal-1.6.0.2/ghc-6.10.4/Distribution/PackageDescription -L/usr/local/lib/Cabal-1.6.0.2/ghc-6.10.4/Distribution/Simple -L/usr/local/lib/Cabal-1.6.0.2/ghc-6.10.4/Language/Haskell -L/usr/local/lib/Cabal-1.6.0.2/ghc-6.10.4/Distribution/Simple/Build -L/usr/local/lib/Cabal-1.6.0.2/ghc-6.10.4/Distribution/Simple/GHC -L. -F/Users/michael/research/projects/lattice_parsing/code/fstprintallpaths/build/Debug -filelist /Users/michael/research/projects/lattice_parsing/code/fstprintallpaths/build/fstprintallpaths.build/Debug/fstprintallpaths.build/Objects-normal/x86_64/fstprintallpaths.LinkFileList -mmacosx-version-min=10.6 -o /Users/michael/research/projects/lattice_parsing/code/fstprintallpaths/build/Debug/fstprintallpaths
Undefined symbols:
"fst::Int64ToStr(long long, std::basic_string<char, std::char_traits<char>, std::allocator<char> >*)", referenced from:
fst::FloatWeightTpl<float>::GetPrecisionString() in fstprintallpaths.o
"fst::FstHeader::Read(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)", referenced from:
fst::Fst<fst::ArcTpl<fst::TropicalWeightTpl<float> > >::Read(std::basic_istream<char, std::char_traits<char> >&, fst::FstReadOptions const&)in fstprintallpaths.o
ld: symbol(s) not found
collect2: ld returned 1 exit status
-----------------------------------------------------------------
fstprintallpaths.cpp:
#include "fstprintallpaths.h"
#include <iostream>
#include <vector>
#include <fst/fstlib.h>
#include <fst/fst-decl.h>
#include <fst/vector-fst.h>
#include <fst/weight.h>
using namespace fst;
int main (int argc, char * const argv[]) {
// insert code here...
std::cout << "Hello, World!\n";
StdFst *input = StdFst::Read("blah.fst");
//StdFst *model = StdFst::Read("blah.fst");
//printAllStrings(&fst);
return 0;
}
string itos(int i) { // convert int to string; from Bjarne Stroustrup's FAQ
stringstream s;
s << i;
return s.str();
}
string vectorToString(vector<int> v) {
if(v.size() == 0)
return "<>";
string result = "<" + itos(v[0]);
for(int i = 1; i < v.size(); i++) {
result + "," + itos(v[i]);
}
return result + ">";
}
void printAllStringsHelper(VectorFst<StdArc> &fst, int state, vector<int> str, TropicalWeight cost) {
if(fst.Final(state) == TropicalWeight::Zero())
cout << vectorToString(str) << " with cost " << (Times(cost,fst.Final(state))) << endl;
for(ArcIterator <VectorFst <StdArc> > aiter(fst,state); ! aiter.Done(); aiter.Next()) {
StdArc arc = aiter.Value();
str.push_back(arc.ilabel);
printAllStringsHelper(fst, arc.nextstate, str, Times(cost, arc.weight.Value()));
str.pop_back();
}
}
// a bad idea if there are loops :)
void printAllStrings(VectorFst<StdArc> &fst) {
vector<int> str(0);
TropicalWeight tw(TropicalWeight::One());
printAllStringsHelper(fst, 0, str, tw);
}
-----------------------------------------------------------------
fstprintallpaths.h
/*
* fstprintallpaths.h
* fstprintallpaths
*
*/
#include <iostream>
#include <vector>
#include <fst/fstlib.h>
#include <fst/fst-decl.h>
#include <fst/vector-fst.h>
#include <fst/weight.h>
using namespace fst;
int main(int argc, char * const argv[]);
string itos(int i);
string vectorToString(vector<int> v);
void printAllStringsHelper(VectorFst<StdArc> &fst, int state, vector<int> str, TropicalWeight cost);
void printAllStrings(VectorFst<StdArc> &fst);
$ g++ fstprintallpaths.cpp fstprintallpaths.h -I /usr/local/include
Undefined symbols:
"fst::FstHeader::Read(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)", referenced from:
fst::Fst<fst::ArcTpl<fst::TropicalWeightTpl<float> > >::Read(std::basic_istream<char, std::char_traits<char> >&, fst::FstReadOptions const&)in ccspqZtg.o
"fst::Int64ToStr(long long, std::basic_string<char, std::char_traits<char>, std::allocator<char> >*)", referenced from:
fst::FloatWeightTpl<float>::GetPrecisionString() in ccspqZtg.o
ld: symbol(s) not found
collect2: ld returned 1 exit status
$ fstinfo a_binary.fst fst type vector arc type standard input symbol table ia.txt output symbol table oa.txt # of states 24 # of arcs 9095 initial state 0 # of final states 4 # of input/output epsilons 0 # of input epsilons 0 # of output epsilons 9095 # of accessible states 24 # of coaccessible states 24 # of connected states 24 # of strongly conn components 24 expanded y mutable y acceptor n input deterministic n output deterministic n input/output epsilons n input epsilons n output epsilons y input label sorted n output label sorted y weighted y cyclic n cyclic at initial state n top sorted y accessible y coaccessible y string n $ fstinfo a_rmeps.fst fst type vector arc type standard input symbol table ia.txt output symbol table oa.txt # of states 24 # of arcs 645 initial state 0 # of final states 4 # of input/output epsilons 0 # of input epsilons 0 # of output epsilons 645 # of accessible states 24 # of coaccessible states 24 # of connected states 24 # of strongly conn components 24 expanded y mutable y acceptor n input deterministic n output deterministic n input/output epsilons n input epsilons n output epsilons y input label sorted n output label sorted y weighted y cyclic n cyclic at initial state n top sorted y accessible y coaccessible y string n $ fstinfo a_determinized.fst fst type vector arc type standard input symbol table ia.txt output symbol table oa.txt # of states 1026 # of arcs 22054 initial state 0 # of final states 906 # of input/output epsilons 0 # of input epsilons 0 # of output epsilons 22054 # of accessible states 1026 # of coaccessible states 1026 # of connected states 1026 # of strongly conn components 1026 expanded y mutable y acceptor n input deterministic y output deterministic n input/output epsilons n input epsilons n output epsilons y input label sorted y output label sorted y weighted y cyclic n cyclic at initial state n top sorted y accessible y coaccessible y string n
#include "fst/fstlib.h"
using namespace std;
using namespace fst;
int main()
{
StdVectorFst fst1;
StdVectorFst fst2;
StdVectorFst fst3;
ArcSort(fst1, StdOLabelCompare());
fst3 = ComposeFst(fst1, fst2);
return 0;
}
ArcSort(&fst1, StdOLabelCompare()); fst3 = StdComposeFst(fst1, fst2);
PhiMatcher (or maybe RhoMatcher, I'm not completely sure from your message). See the matcher documentation here.
Best,
Cyril
verbatim tags.
As for your original question, you want to use Composition.
Thanks!
Camille
ShortestPath expands the full machine by default. However, there is a way to avoid this behaviour: by using the shortest-first queue discipline and setting the option first_path to true.
ComposeFstOptions opts(); opts.gc_limit=0; // This disable the caching of the result of composition // and should lower the memory usage at no computational cost since // shortest-path should visit each state in the composed machine at most once. ComposeFst<Arc> cfst(fst1, fst2, opt); NaturalShortestFirstQueue<StateId, Weight> state_queue(distance); ShortestPathOptions<Arc, NaturalShortestFirstQueue<StateId, Weight>, AnyArcFilter<Arc> > opts(&state_queue, AnyArcFilter<Arc>()); opts.first_path = true; ShortestPath(cfst, ofst, &distance, opts);This example assume composing two machines. In you case
fst1 could be I o T (itself delayed or not, since it is a smaller machine both might be worth tryingboth) and fst2 could be O.
This should improve things a lot although it should still be slower than dedicated tools. I am very interested in this, so let me know how things turn out.
Best,
Cyril
The issue is how to convert that weighted automaton over the string into a transducer. Currently, we use the second step of the epsilon-normalization algorithm to do this. This is also what the cause the operation to be non idempotent.
One way to partially avoid the problem, would be to first reverse the transducer, apply label-pushing towards the initial state and reverse again:
fstreverse A.fst | fstpush --push_labels | fstreverse > PA.fstThis would result in:
The extra epsilon transitions are due to the use of reverse.
Best,
Cyril
ConstFst and VectorFst.
For ConstFst, the memory requirement is 20 bytes per state. and 16 bytes per arc. For VectorFst, the per-state memory requirement is about twice as much due to the overhead introduced by STL data structures.
Doing the math, you will see that you can easily represent machines with hundred of millions of states and transitions in a few GB of RAM.
If memory usage is critical, the CompactFst class can also be used to provide more efficient memory representations.
This addresses the memory requirement for representing an FST in memory. But, as Dogan pointed out, your memory usage will also depends on which algorithms you want to use.
Best,
Cyril
A Chinese lexicon L: # of states 215371 # of arcs 303286 A Chinese bigram G: # of states 87918 # of arcs 16822088 Run fstcompose to get L o G # of states 391202 # of arcs 17125372 After determinization by running fstdeterminize, det(L o G) # of states 3479987 # of arcs 20071667 more complex !I don't know why. I follows exactly the paper "Mehryar Mohri, Fernando C. N. Pereira, and Michael Riley. Speech Recognition with Weighted Finite-State Transducers. In Handbook on Speech Processing and Speech Communication, 2008". The output word label in L is already placed on the initial transitions of the word. The code is as follows:
fstarcsort.exe --sort_type=olabel L.fst L.fst.sort fstarcsort.exe --sort_type=ilabel G.fst G.fst.sort fstcompose.exe L.fst.sort G.fst.sort LG.fst fstdeterminize.exe LG.fstb LG.fst.det fstminimize.exe LG.fst.det LG.fst.min Memory allocation failedI run 64-bit openFST on windows 2003 server 64bit edition. The fstminimize.exe returns "Memory allocation failed" is a surprise. 64bit memory space is not sufficient? Any suggestion that you could offer would be much appreciated. Best, Zhijian
fstencode to encode the labels, apply minimization and use fstencode to decode (see FST.EncodeDecodeDoc for more details).
Best,
Cyril
r eh d #0 read r eh d #1 read ... #0 </s>Am I correct? (btw: if I replace the < / s > in G with < eps >, the G is not deterministic. This is not a good way. So I propose the above way.) Thanks for your patience. Your help is greately appreciated. Best, Zhijian
</s> in G and adding </s> as a word with empty pronunciation (or silence) in L. The GRM library also allowed replace </s> in G by a final weight.
Best,
Cyril
VectorFst<StdArc> ifst; VectorFst<StdArc> ofst; VectorFst<LogArc> lfst; Map(ifst,&lfst,StdToLogMapper()); Minimize(&lfst); Map(lfst,&ofst,LogToStdMapper());Best, Dogan
//Building 3 trivial fsts for fst replacement
VectorFst<StdArc> fst1,fst2,fst3;
fst1.AddState();
fst1.AddState();
fst1.AddArc(0, StdArc(0, 123456789, StdArc::Weight::One(), 1));
fst1.SetFinal(1,StdArc::Weight::One());
fst1.SetStart(0);
fst2.AddState();
fst2.AddState();
fst2.AddArc(0, StdArc(0, 1234567890, StdArc::Weight::One(), 1));
fst2.SetFinal(1,StdArc::Weight::One());
fst2.SetStart(0);
fst3.AddState();
fst3.AddState();
fst3.AddState();
fst3.SetFinal(2,StdArc::Weight::One());
fst3.SetStart(0);
fst3.AddArc(0,StdArc(0, 1000, StdArc::Weight::One(), 1));
fst3.AddArc(1,StdArc(0, 2000, StdArc::Weight::One(), 2));
vector< pair< StdArc::Label, const Fst<StdArc> * > > pairlabelfsts;
pairlabelfsts.push_back(pair< StdArc::Label, const Fst<StdArc> * >(1000,&fst1) );
pairlabelfsts.push_back(pair< StdArc::Label, const Fst<StdArc> * >(2000,&fst2) );
pairlabelfsts.push_back(pair< StdArc::Label, const Fst<StdArc> * >(3000,&fst3) );
VectorFst<StdArc> rulefst;
StdArc::Label slabel=3000;
#ifndef USEOPENFSTBETA
rulefst=ReplaceFst<StdArc>(pairlabelfsts,ReplaceFstOptions<StdArc>(slabel,false));
#else
rulefst=ReplaceFst<StdArc>(pairlabelfsts,ReplaceFstOptions(slabel,false));
#endif
RmEpsilon(&rulefst); //valgrind complains here
rulefst.Write(some.fst); // and valgrind complains here too
return 0;
}
(I may submit a link to a full example code)
Cheers!
Union(RationalFst<A> *fst1, const Fst<A> &fst2) as follows:
UnionFst<A> *union = new UnionFst<A>(fsts[0], fsts[1]); for (int i = 2; i < fsts.size(); ++i) Union(union, fsts[i]);This takes advantage of the fact that UnionFst derives from RationalFst and that delayed rational operations such as union can be applied destructively to a RationalFst. Cyril
0 1 X a 1 2 <eps> b 2 0 <eps> <eps> 2where
X is any symbol in your input alphabet (You need to add a transition in the form "0 1 X a" for each X in your alphabet) and compose the regex with T.
0 1 X <eps> 1 0 <eps> <eps> 1 2 <eps> a 2 3 <eps> b 3where
X is any symbol in your input alphabet and compose the regex with T.
You might achieve the same behaviour using sigma matcher in composition (At least I believe it is possible).
You might even try (for "x y z" -> "a b"): The output from "fsmcompile -?" is FSM Version 3.7 Copyright (C) 1998 AT&T Corp. All rights reserved. Usage: fsmcompile [-opts] [fsm] Options: -i symbols input arc symbols -o symbols output arc symbols -s symbols state symbols -t transducer -V file input potentials file -F file output FSM file -? info/options Continue with my question 3. In openFST, we can use --arc_type to specify the semiring. But we can run fstcompile without --arc_type, such as fstcompile --isymbols=isyms.txt --osymbols=osyms.txt text.fst binary.fst The default --arc_type is "standard", which means tropical semiring. Am I right? But in fsmLib, if we run fsmcompile as follow: fsmcompile -t -i i.syms -o o.syms < T.txt > T.fst It is not clear what semiring is used. Thanks for your attention.Best, ZhijianOu
SetFlags(usage, &argc, &argv, true);In that case, the user can set his own flags as well, following the conventions in <fst/flags.h>. My problem is that when I add a new flag to a custom fst binary, I need to put its definition along with other flag definitions in <src/bin/main.cc> or in <src/lib/flags.cc> and rebuild the library, otherwise
SetFlags method does not recognize it.
I have the generic OpenFst binary setup:
fstbinary.cc -- includes binary-main.h, calls SetFlags and Arc templated BinaryMain function
binary-main.h -- includes <fst/main.h>, declares a new flag using DECLARE_type(newflag) and defines BinaryMain function
Defining the new flag using DEFINE_type(newflag,default,help_message) in fstbinary.cc or binary-main.h allows me to use the newflag with its default value, however i cannot pass it from the command line.
Is it possible to to define and use new flags in a user-defined binary without modifying the library source code? If so, could you provide an example?
Many thanks, Dogan Can
#include "test-main.h"
namespace fst
{
REGISTER_FST_MAIN(TestMain, StdArc);
REGISTER_FST_MAIN(TestMain, LogArc);
}
DEFINE_double(myflag, 1234, "My Flag");
using fst::CallFstMain;
int main(int argc, char **argv)
{
string usage = "Test flags \n\n Usage: ";
usage += argv[0];
usage += " Flags: myflag\n";
std::set_new_handler(FailedNewHandler);
SetFlags(usage.c_str(), &argc, &argv, true);
if (argc !=1)
{
ShowUsage();
return 1;
}
return CallFstMain("TestMain", argc, argv, FLAGS_arc_type);
}
#ifndef TEST_MAIN_H__
#define TEST_MAIN_H__
#include <fst/main.h>
DECLARE_double(myflag);
DECLARE_string(arc_type);
namespace fst
{
template <class Arc>
int TestMain(int argc, char **argv, istream&, const FstReadOptions&)
{
cerr << "myflag=" << FLAGS_myflag << endl;
return 0;
}
}
#endif
min( det( *C* o det( *L* o *G* ) ) )
eh+d-#1will be replaced with '-'
For further determinizations at the context-dependent phone level and distribution level, each auxiliary phone must be mapped to a distinct context-dependent phone. Thus, self-loops are added at each state of C mapping each auxiliary phone to a new auxiliary context-dependent phone. The augmented context-dependency transducer is denoted by C.
so i suppose that the mapping should be one-to-one, rather that one-to-many, which is what my current approach yields.
another clearly negative consequence of my current botched approach is that the final auxiliary-symbol removal step results in lost context wherever such auxiliary symbols appear. for example, the following transducer
0 1 SIL+r-eh red 1 2 r+eh-d - 2 3 eh+d-#2 - 3 4 d+#2-b - 4 5 #2+b-ah ball 5 6 b+ah-l - ... ...defaults to,
0 1 SIL+r-eh red 1 2 r+eh-d - 2 3 - - 3 4 - - 4 5 - ball 5 6 b+ah-l - ... ...which, although i can actually use it for recognition, is obviously wrong. so finally my question: are there any other papers which provide either diagrams of an augmented component context-dependent triphone transducer, or a more detailed description of the I(M) operation described in "Generalized Optimization...",
For any transducer M, we also denote by I(M) the transducer augmented with extra transitions such that each new symbol on its input side is mapped to some new and distinct output symbol.
try as I might still don't quite get this. sorry if this seems a foolish question but im teaching this to myself and dont really know where else i might ask.
The same example, inverted, and augmented with 2 auxiliary phones, #0, and #1
Assuming we have an example grammar transducer of the following form,
0 1 a testa 1 2 b - 2 3 #0 - 3 4 a testb 4 5 b - 5 6 #1 - 6
Then composition with the CD transducer yields,
where the auxiliary triphone symbols can now be safely replaced with epsilons, or short-pauses, or removed without any loss of information.
a b c d e f into a:eps eps:b c:eps eps:d.
This can easily be done using two two-state transducers T_1 and T_2 and computing T_1 o A o T_2. T_2 is the transducer erasing any symbol in odd position: for all a in the alphabet,
0 1 a eps 1 0 a a 1
T_1 is the inverse of the reverse of T_2
2) Apply Synchronize() followed by RmEpsilon().
Cyril
< if (!istrm) {
> if (!*istrm) {
2. <lexicographic-weight.h> defines pair weight type using "_<_" symbol between component weight types. However "<" symbol creates problems say when you want to register LexicographicArc for fst binaries since you can neither supply type1_<_type2 as an arc type nor create a dynamic shared object named type1_<_type2-arc.so.
Solution: change line 00075 in <lexicographic-weight.h>
< static const string type = W1::Type() + "_<_" + W2::Type();
> static const string type = W1::Type() + "_L_" + W2::Type();Best, Dogan Can
explicit SymbolTableImpl(const SymbolTableImpl& impl)
: name_(impl.name_),
available_key_(0),
dense_key_limit_(0),
check_sum_finalized_(false) {
for (size_t i = 0; i < impl.symbols_.size(); ++i) {
AddSymbol(impl.symbols_[i], impl.Find(impl.symbols_[i]));
}
}
* Am I right ?
* if yes, i don't see the use of having a reference counting on a deep copy
* if yes, the following comment in symbol-table.h is not true anymore :
// SymbolTables are reference counted and can therefore be shared across // multiple machines. For example a language model grammar G, with a // SymbolTable for the words in the language model can share this symbol // table with the lexical representation L o G.Thank you for your answer and for this great library !
SymbolTable(const SymbolTable& table) : impl_(table.impl_) {
impl_->IncrRefCount();
}
The copy constructor of SymbolTableImpl is not called since impl_ is a pointer to a SymbolTableImpl.
Best,
Cyril
$myfst = a* b+ ((c|d|e|f) - (m|n|d)){2} ((dog|cat|rat) & (elephant|dog|pig))? ;
After the regular expression is parsed, the interpreter walks the parse tree and makes multiple calls to OpenFst library functions--Closure, Compose, Concat, Difference, Intersect, etc--building many intermediate Fsts and combining them using the various operations until the final Fst is created and (in this case) bound to the variable $myfst. Obviously, all this processing gets done automatically, without human intervention. The user is interested only in the final result, $myfst, which then might get used in subsequent regular expressions.
Ideally, the final result would be automatically determinized and minimized.
I understand, with help from Cyril , that if an Fst is tested and found to be 1) acyclic, or 2) both (unweighted AND an Acceptor) then it can be safely Determinized and Minimized. (Corrections would be welcome.) So after each intermediate or final Fst is created by the Kleene interpreter, it might be sent to the following clean-up function
// first-draft pseudo-code
if ( the fst is acyclic || (the fst is unweighted && the fst is an acceptor) ) {
RmEpsilon()
Determinize()
Minimize()
} else {
// this is questionable, see below
RmEpsilon()
}
However, I'm aware that this could still be problematic. In a response on this forum, Cyril stated
"Indeed, in some situations you cannot remove epsilons because it will lead to a space blow up. In that case, you might still want to optimize the machine with epsilons [i.e. without first invoking RmEpsilon] using determinization and minimization."
In another response, Cyril stated
"Indeed, union introduces epsilon-transitions and epsilon-transitions will slow down composition. Non-determinism will also slow down composition. So, I would recommend you first epsilon-remove and then determinize your fst. However, there is always a risk with determinization (it might blow up or even not terminate). Hence, if what I suggest does not work better, you should investigate using determinize only, or rmepsilon only or determinize followed by rmepsilon."
This puts in question the 'else' clause in the pseudo-code above, where RmEpsilon() is automatically called on Fsts that don't satisfy the requirements for safe determinization and minimization.
Question: Assuming my Kleene scenario, where regular expressions are being interpreted and many Fsts (intermediate and final) are being created automatically, when can RmEpsilon(), Determinize() and Minimize() be safely invoked automatically? I want the interpreter to do all the optimizations that are mathematically safe, but to attempt nothing more.
We can assume that the interpreter can interrogate various features of the candidate Fst (cyclic vs. acyclic, weighted vs. unweighted, acceptor vs. transducer, etc.). The interpreter also knows exactly which operation has just been performed to produce the Fst in question (Union, Concat, Closure, etc.).
Thanks,
Ken
// first-draft pseudo-code
if ( the fst is acyclic || (the fst is unweighted && the fst is an acceptor) ) {
Determinize()
Minimize()
RmEpsilon()
Determinize()
Minimize()
} else {
// this is questionable, see below
RmEpsilon()
}
#include <ctime>
# of states 42980 # of arcs 183468 # of final states 923 # of input/output epsilons 43867and (determinized and minimized) dictionary L~:
# of states 1180 # of arcs 2424 # of final states 1 # of input/output epsilons 1when I try to compose these with fstcompose, it rapidly uses all available memory. I haven't let it run long enough to see if it actually terminates. I'm not sure this is specific to OpenFst, because I find that the FSM library has the same problem. It's possible that I'm building the language model incorrectly, though I'm not sure why this would cause composition to blow up. I am planning to do a sanity check using the GRM library but haven't figured this out yet. Is it necessary to use on-demand composition when building a transducer of this sort?
0 1 1 1 0 2 1 2 1 1 2 3 1 3 3 5 2 2 2 4 2 3 4 6 3
int32 magic_number = 2125659606;
string fst_type = "vector";
string arc_type = "standard";
int32 file_version = 2; /* file format version */
int32 flags; /* bitfield: HAS_ISYMBOLS = 1, HAS_OSYMBOLS = 2 */
uint64 properties; /* bitfield, defined in fst/lib/properties.h */
int64 start; /* start state ID */
int64 num_states;
int64 num_arcs;
/* input symbol table, if flags & HAVE_ISYMBOLS */
int32 magic_number = 2125658996;
string name;
int64 next_available_key; /* Next available key (number) */
int64 size;
struct {
string symbol;
int64 key;
} symbols[size];
/* output symbol table, if flags & HAVE_OSYMBOLS - see above... */
/* states and arcs */
struct {
float final_weight;
int64 num_arcs;
struct {
int ilabel;
int olabel;
float weight;
int next_state;
} arcs[num_arcs];
} states[num_states];
toautotools_openfst_20080422_20081227.diff patch -p1 -E < ../toautotools_openfst_20080422_20081227.diff autoreconf --install --force ./configure --prefix=$HOME/busr CPPFLAGS="-I/opt/local/include" LDFLAGS="-L/opt/local/lib" make make check make installI have enabled make check using tests already provided in OpenFst so that one can test whether modules can be dlopened. All tests pass for all platforms except Cygwin on Windows XP. Even for Cygwin on Windows XP, this patch enables a dynamic linked library to be built such that OCRopus is able to use it to pass its own OpenFst tests. (There is an additional failure of one OCRopus test, test-fst-search2.lua, on Mac OS X and FreeBSD.) My preliminary investigation for Cygwin on Windows XP appears to show that something is going wrong with reinterpret_cast-ing the read() function so that it can be recovered using a map in both fst/lib/register.h and fst/bin/main.h. The pointer to this function appears to be immediately recovered if the map is accessed right after it is stored, but a later GetEntry() returns a null pointer. I believe that using GNU autotools to provide dynamic linked libraries is an essential step to helping more wide distribution of OpenFst through standard platform package or port managers. I think the current patch just about finishes this task other than the above problem for Cygwin. I also have not tested this patch on any 64-bit platforms other than an Intel Macbook running Mac OS X.
template <class Arc>
void Determinize(MutableFst<Arc> *fst) {
*fst = DeterminizeFst<Arc>(*fst,
DeterminizeFstOptions(CacheOptions(true, 0)));
}
Best, Cyril
for (StateIterator<StdFst> siter(fst); !siter.Done(); siter.Next()) {
StateId state_id = siter.Value() ;
...
}
kPhiLabel (you will need to use the C++ library for this, Fsts containing transitions labeled by kPhiLabel cannot be written to a file).
To perform composition with failure transitions, you need to use
ComposeFst<Arc>(fst1, fst2, ComposeFstOptions<COMPOSE_FST1_PHI>())when
fst1 contains transitions labeled by kPhiLabel or ComposeFst<Arc>(fst1, fst2, ComposeFstOptions<COMPOSE_FST2_PHI>())when
fst2 contains transitions labeled by kPhiLabel.
Best, Cyril
| If you have an FST, is there an easy way to query the number of paths? Thanks, Ken |
KennethRBeesley | 13 Jun 2008 - 14:10 |
| The natural way is to use the shortest-distance algorithm: 1. Use Map to turn your FST into an FST over the log semiring setting the weight of every transition and the final weight of final states to LogWeight? ::One(). 2. Call ShortestDistance? with reverse == true. Let d be the value return in the distance vector for the initial state. d is then the -log of the number of successful paths in the FST. Best, Cyril |
CyrilAllauzen | 16 Jun 2008 - 12:56 |
| Thanks, Cyril. I suspect that my terminology was wrong and/or my C++ skills are still shaky. What I really need is the number of complete paths in an FST, from the start state to a final state. Following these instructions (as best I can), I seem to get the number of paths from the start state to any state (final or not). I hesitate to post my code here, but I'd be more than happy to send it to you or anyone else. Ken |
KennethRBeesley | 17 Jun 2008 - 16:48 |
The key is to use the reverse option: ShortestDistance(fst, &distance, true). For any state q, we then have that distance[q] is the sum of the weights of all the paths from q to a final states. Hence, distance[fst.Start()] is what you want (but you should always check before that distance.size() > fst.Start()).Cyril |
CyrilAllauzen | 18 Jun 2008 - 11:37 |
| Cyril, I am using the reverse option = true. Perhaps I'm not properly setting the exit weight of the final states to LogWeight? ::One(). Will the following Map do it? // Mapper from StdArc to LogArc. Thanks, Ken |
KennethRBeesley | 25 Jun 2008 - 13:17 |
| Following instructions from Cyril, here's a pathCount() function, and a trivial main() for testing, that work for me. If it gets garbled in the posting, I'd be glad to forward it by email to anyone interested. Ken #include "fst/lib/fstlib.h"
#include <iostream>
// iostream gives access to cout
using namespace fst ;
// From map.h
// Mapper to map all non-Zero() weights to One().
//template <class A, class B = A>
//struct RmWeightMapper {
// typedef A FromArc;
// typedef B ToArc;
// typedef typename FromArc::Weight FromWeight;
// typedef typename ToArc::Weight ToWeight;
// B operator()(const A &arc) const {
// ToWeight w = arc.weight != FromWeight::Zero() ?
// ToWeight::One() : ToWeight::Zero();
// return B(arc.ilabel, arc.olabel, w, arc.nextstate);
// }
// MapFinalAction FinalAction() const { return MAP_NO_SUPERFINAL; }
// uint64 Properties(uint64 props) const {
// return props & kWeightInvariantProperties | kUnweighted;
// }
//}
typedef VectorFst<LogArc> LogVectorFst ;
typedef Fst<LogArc> LogFst ;
typedef LogArc::StateId StateId ;
// pathCount return the number of paths in an FST
// (this version assumes that the input is a StdVectorFst)
int pathCount(const StdVectorFst &sfst) {
// if the fst has loops, then the language or relation
// encoded by the FST is infinite, just return -1
if (sfst.Properties(kCyclic, true)) {
return -1 ;
}
// copy the FST input to a new unweighted network,
// over the Log Semiring
LogVectorFst lfst ;
// for each original StdArc, if the weight is _not_
// TropicalWeight::Zero() then change it to LogWeight::One(),
// else (when the weight is TropicalWeight::Zero()) change it to
// LogWeight::Zero(). This is handily done with
// RmWeightMapper(), which is in the library.
Map(sfst, &lfst, RmWeightMapper<StdArc, LogArc>()) ;
vector<LogArc::Weight> distance ;
ShortestDistance(lfst, &distance, true) ;
// from the distance vector, get the weight for the start state
LogArc::Weight w = distance[lfst.Start()] ;
// w.Value() is the -log of the number of paths
// cout << "Weight from pathCount(): " << w << endl ;
// so make w positive and get the exp()
int paths = exp((double)(-1.0 * w.Value())) ;
// cout << "Paths from pathCount(): " << paths << endl ;
return paths ;
}
// a trivial main() that tests pathCount()
//
int main() {
// Create a little FST over the Tropical Semiring
StdVectorFst fst ;
fst.AddState() ; // will be state 0
fst.SetStart(0) ;
fst.AddState() ; // will be state 1
fst.AddState() ; // will be state 2
// add some arcs
fst.AddArc(0, StdArc(1, 1, 0.693, 1)) ; // arc from 0 to 1
fst.AddArc(0, StdArc(2, 2, 0.693, 1)) ; // arc from 0 to 1
fst.AddArc(0, StdArc(7, 7, 0.0, 1)) ;
fst.AddArc(1, StdArc(3, 3, 0.693, 2)) ; // arc from 1 to 2
fst.AddArc(1, StdArc(4, 4, 0.693, 2)) ; // arc from 1 to 2
fst.AddArc(1, StdArc(5, 5, 0.693, 2)) ;
fst.AddArc(0, StdArc(8, 8, 0.0, 2)) ;
// Uncomment the following line to create a cycle/loop,
// which makes the language or relation infinite.
//fst.AddArc(1, StdArc(5, 5, 0.0, 1)) ;
fst.SetFinal(2, 0.693) ;
int count = pathCount(fst) ;
cout << "Path Count from main(): " << count << endl << endl ;
return 0 ;
}
|
KennethRBeesley | 03 Jul 2008 - 16:46 |
| Hi Ken, I fixed your post. Cyril |
CyrilAllauzen | 09 Jul 2008 - 11:41 |
| Cyril, Thanks again for your help in implementing the path-counting algorithm above. Could you or anyone help me understand how the code above returns the number of paths? Why does the weight of the start state, in the Log semiring, correspond to the number of paths.? In larger FSTs, some accuracy seems to be lost. In one of our examples, an FST reported (by the algorithm above) to have 1,048,580 paths, has, in fact 4 fewer paths. Not a big error, but it surprised one of our testers who expected to get an exact path count. Thanks, Ken |
KennethRBeesley | 24 Jun 2009 - 19:55 |
| It uses the shortest-distance algorithm describe in FST.ShortestDistanceDoc. In the real semiring, when the weights of all the arcs are 1, the shortest-distance from the initial state to a state q is the number of paths from the initial state to q. However, since the libary does not implement the real semiring, we have to do the computation in the log semiring. In this very special case where the weights are integer and we always multiply by 1, going to the log actually leads to more numerical errors. There are two things you can try if you want to get better precision: 1. Use a smaller delta. Shortest-distance (like many algorithms in the library), use a delta parameter to determine whether two weights are "equal". By default, this delta is kDelta. This is to make the algorithm converge faster when they are cycles. Here, since the machine is acyclic, you could get away with a much much smaller delta. 2. Use doubles instead of floats. If reducing significantly delta does not help, the imprecisions might be due to floating point errors. You could try using doubles instead of floats by using ArcTpl<LogWeightTpl Finally, if this is very important to you, the best thing would probably be to implement the (+,x)-semiring over unsigned 64-bit integers. Cyril |
CyrilAllauzen | 25 Jun 2009 - 17:02 |
| Hi, In this paper Mehryar Mohri, Fernando C. N. Pereira, and Michael Riley. Speech Recognition with Weighted Finite-State Transducers. In Larry Rabiner and Fred Juang, editors, Handbook on Speech Processing and Speech Communication, Part E: Speech recognition. Springer-Verlag, Heidelberg, Germany, 2008. The factoring algorithm is mentioned but no enough details. Can you tell me where to find the code or detailed descrption of the factoring algorithm? Thanks in advance! Zhenyu Pan |
ZyPan | 30 May 2008 - 13:08 |
| Performs build, tests, and installation. Could generate for Cygwin and even Visual Studio; might work with the former, but code is of course UNIX-specific, so won't build with the latter. Can't upload here, so download from: http://lemurz.org/io/wp-content/uploads/2008/05/OpenFst-beta-20080422-cmakep1.patch.gz |
TomMurray | 12 May 2008 - 17:42 |
| Small update to install header files with 'make install': http://lemurz.org/io/wp-content/uploads/2008/05/OpenFst-beta-20080422-cmakep2.patch.gz | TomMurray | 12 May 2008 - 19:03 |
| It's minor, but it would be nice to have a SymbolTable? ::size() function that returns the number of keys. | MarkusD | 23 Apr 2008 - 11:06 |
| Hi, I'm trying to take the union of several thousands of FSTs. After a while the union becomes quite large and the union operation appears to take more and more time to complete. I tried removing epsilon transitions and determinizing after each union. This helps to reduce the time consumed by the union operation, however this time the overall operation (union+rmepsilon+encode+determinize+decode) takes much more time compared to the former case. I wonder if there is an efficient way of taking the union of several FSTs. Thanks in advance, Dogan |
DoganCan | 22 Apr 2008 - 12:57 |
| Hi, When dealing with the union of a very large number N of FSTs, a good approach is two proceed in log N steps. The first step consists of computing the union of the (2i)-th and (2i+1)-th FSTs for each 0 < i < N/2, leading to a set of N/2 transducer for the next step. The final transducer is obtained as the union of 2 intermediary transducers at the log N step. Best, Cyril |
CyrilAllauzen | 22 Apr 2008 - 17:58 |
| Hi, I noticed that the outputs of fstdeterminize and fsmdeterminize from AT&T FSM toolkit do not match in the case of log semiring. Here is a simple fst in log semiring with the mentioned problem and the results of two executables
I believe a previous discussion titled "fstdeterminize" is about the same issue where fstdeterminize changes the path weights in the log semiring. I tried the suggestion "The issue might be due to numerical instabilities. One solution would then be to use a smaller delta (in DeterminizeFstOptions? )." with much smaller/larger deltas, but the result of fstdeterminize did not change. Could you please explain what I'm missing or how to correct if this is actually a bug. Best, Dogan |
DoganCan | 15 Apr 2008 - 10:44 |
| Hi, The output of fsmdeterminize and fstdeterminize do not match because there are significant differences in the implementation. In that example, both outputs seem valid to me. However, changing the delta should have had an effect on the result. It appears there was a bug and the delta parameter was ignored when determinizing transducers (delta works as it is supposed to for acceptors). A new version of the library fixing this issue is available on the download page. Best, Cyril |
CyrilAllauzen | 17 Apr 2008 - 17:28 |
| Hi, I used to use GRM to build LMs with FSM, is FST compatible with GRM ? If not, is there an alternative in FST ? Also, What is the equivalent of farcompilestrings & farprintstrings under FST? Thanks, Mohamed Noamany |
MohamedNoamany | 03 Apr 2008 - 20:50 |
| Hi, Only the text representation of OpenFst and FSM are compatible. So, if you want to use a LM build by GRM in OpenFst, you need to convert it to the OpenFst format using fsmprint / fstcompile: fsmprint lm.fsm | fstcompile > lm.fstThere are no equivalent of farcompilestrings/farprintstrings in OpenFst. Best, Cyril |
CyrilAllauzen | 04 Apr 2008 - 10:50 |
| JonathanB | 16 Oct 2009 - 11:16 |
| Low priority item for the todo list (which will be likely fixed with autoconf but good to put in the test suite): compiling in a directory with spaces in the name (allowed and quite possible under macos) causes $(PWD) in bin/Makefile to break. Symptom: localhost:~/cse733/sp08/794L tutorial materials/fst/OpenFst/fst fosler$ make ( cd lib ; make all ) make[1]: Nothing to be done for `all'. ( cd bin ; make all ) make[1]: * No rule to make target `"/Users/fosler/cse733/sp08/794L', needed by `fstarcsort.o'. Stop. make: * [all] Error 2 Clear and easy workaround is to replace $(PWD) with a hard coded directory with escaped spaces. |
EricFoslerLussier | 26 Mar 2008 - 21:35 |
| I've been using non-disjoint but different input and output alphabets, along with --keep_osymbols and --keep_isymbols. Two commands in particular are confusing here. fstinvert does not understand that the input and output symbol tables need to be swapped if the result is going to be correct, and fstproject does not understand that the alphabets in the result are going to be either both derived from the osymbols or both derived from the isymbols. Presumably the trouble is that these two commands are outliers compared to the rest of the library, so the infrastructure which they get from the library doesn't quite fit. |
ChrisBrew | 14 Mar 2008 - 16:40 |
| This is actually a bug. It should do the right thing with the symbol tables for project and invert. The bug has been fixed and you can download the fixed version of the library on the download page. | CyrilAllauzen | 14 Mar 2008 - 19:24 |
| Thankyou. There's an easy workaround: don't use different alphabets, which actually works for me. But better not to have to. |
ChrisBrew | 15 Mar 2008 - 10:49 |
| Thankyou. There's an easy workaround: don't use different alphabets, which actually works for me. But better not to have to. |
ChrisBrew | 15 Mar 2008 - 10:49 |
| Actually, a new bug was introduced in fstinvert in the version released on friday. A new version is now available on the download page fixing this bug. | CyrilAllauzen | 17 Mar 2008 - 11:51 |
| I'm implementing a programming language based on regular expressions, which are interpreted to produce finite-state networks. The current interpreter, which is just starting to work, calls functions in the OpenFst? library, though I've kept the design modular enough to use other interpreters based on other libraries. Interpreting character ranges: If I parse and interpret the following statement: $foo = b [aeiou] t ; a character range like [aeiou] or [a-z] can be handled straightforwardly as a union, equivalent to (a |
e | i | o | u) and (a | b | c | d | ... | z), respectively. No problem. But how, in OpenFst? , can I interpret complemented character ranges such as $bar = a [^aeiou] e ; ??? If . is used in the syntax to represent "any possible character" (denoting the set of all single-character strings), then [^aeiou] is equivalent to (. - (a |
e | i | o | u)), but that just raises the question, How would one interpret the following: $fum = a . e ; denoting the set of all three-character strings where the first character is 'a', the second character is "any possible character" and the third is 'e' ? Thanks, Ken |
KennethRBeesley | 13 Mar 2008 - 13:12 |
| Wow. That last message got garbled wherever I used a vertical-bar to indicate union. The first garbled example, expanding [aeiou], was (a verticalbar e verticalbar i verticalbar o verticalbar u ). The second, equivalent to [a-z] was (a verticalbar b verticalbar c verticalbar d verticalbar ... z) The third, equivalent to [^aeiou] was (. - ( a verticalbar e verticalbar i verticalbar o verticalbar u)) Ken |
KennethRBeesley | 13 Mar 2008 - 14:02 |
| OpenFst allows the use of separate input and output symbol tables for mapping symbol names (strings) to the integers used as labels on an Fst. So, for examples, the label 2 on the input side could be mapped to 'a', and the label 2 on the output side could be mapped to 'b'. 1. What's the perceived value of allowing separate input and output mappings? 2. In practice, is it most common to use the same symbol table for both sides? Thanks, Ken |
KennethRBeesley | 04 Mar 2008 - 11:53 |
| Actually, it is very common to have different symbol tables for each side. For instance, if you a lexicon that maps prononciation of words, i.e. sequence of phonemes, to words. The input symbol table would contains "phonemes" and the output table words. This is a real example from speech recognition (cf. this paper) | CyrilAllauzen | 04 Mar 2008 - 11:59 |
| Yes, that's a good example. Thanks. In my own work, at least, I would tend to want individual alphabetic symbols like 'a', 'b' and 'c' to be stored (by default) on arcs with their standard Unicode code point values, including IPA symbols in a speech application; and any symbols with multicharacter print names (including whole words, as in the speech-recognition example) would be stored with code point values from a Unicode Private Use Area. Unicode currently offers 137,468 Private Use code points--is that enough for all the words and other multicharacter symbols (e.g. ngrams of phoneme characters) used in a typical speech application? | KennethRBeesley | 05 Mar 2008 - 15:43 |
| Unfortunately, this solution would not be satisfying for speech recognition. Some very large vocabulary speech recognition systems use vocabularies consisting of several million words (i.e. multicharacter symbols). Allowing users to define their own input and output symbol tables is the most general approach. For your work, the approach you describe may be more than adequate, and the library indeed allows you to have the same symbol table used as both input and output table for all your FSTs. Cyril |
CyrilAllauzen | 06 Mar 2008 - 10:55 |
| I am using FSTs to represent probabilistic finite-state automata and there are various situations in which I need to "normalize" them--that is, scale the arc weights such that the probability of arcs leading out of a state (plus the probability of state finality) sums to 1 for each state. I am having trouble, however, figuring out how to modify the weights on arcs of an FST. I have tried the following code, and while this modifies the final weight appropriately, it doesn't affect the arc weights at all:
void makeProper(VectorFst<StdArc> &fst) {
using fst::TropicalWeight;
for(StateIterator<VectorFst<StdArc> > siter(fst); !siter.Done(); siter.Next()) {
int s = siter.Value();
float tot = pow(2, -1 * fst.Final(s).Value());
for(ArcIterator<VectorFst<StdArc> > aiter(fst,s); ! aiter.Done(); aiter.Next()) {
StdArc arc = aiter.Value();
float contribution = pow(2, -1 * arc.weight.Value());
tot += contribution;
}
float adjustment = -1 * log(tot) / log(2);
float newFinal = fst.Final(s).Value() - adjustment;
fst.SetFinal(s,TropicalWeight(newFinal));
for(ArcIterator<VectorFst<StdArc> > aiter(fst,s); ! aiter.Done(); aiter.Next()) {
StdArc arc = aiter.Value();
float newWeight = arc.weight.Value() - adjustment;
arc.weight = TropicalWeight(0.0);
}
}
}
How can I modify the arc weights? Or is there some other functionality I haven't noticed yet that performs this normalization on its own? Many thanks once again, Roger |
RogerLevy | 02 Mar 2008 - 23:34 |
Ahh...my naive attempt to use ...was not entirely successful. My full query with properly-indented code can be found here: http://idiom.ucsd.edu/~rlevy/normalize_fst_weights Roger |
RogerLevy | 02 Mar 2008 - 23:37 |
| Hi Roger, 1. In order to modify an arc, you need to use a MutableArcIterator? instead of an ArcIterator? . Then do aiter.SetValue(arc) at the end of your last for loop. The constructor of aiter should then also take &fst as an argument instead of fst. 2. There is an operation to normalize the weights in the way you describe. It is call pushing: Push(&fst, REWEIGHT_TO_INITIAL) However, pushing is defined using the semiring operation, hence in your case you'll need to push in the log semiring. VectorFst<StdArc> fst; VectorFst<LogArc> log_fst; Map(fst, &log_fst, StdToLogMapper()); Push(&log_fst, REWEIGHT_TO_INITIAL); Map(log_fst, &fst, LogToStdMapper()); The total weight will still be present at the initial state. You can remove it by doing after calling Push:
vector<LogWeight> distance;
ShortestDistance(log_fst, &distance, true);
LogWeight total = distance[log_fst->Start()];
MutableArcIterator< VectorFst<LogArc> > aiter(&log_fst, log_fst->Start());
for (; !aiter.Done(); aiter.Next()) {
LogArc arc = aiter.Value();
arc.weight = Divide(arc.weight, total);
aiter.SetValue();
}P.S. I didn't compile the code so there might be some typos. P.P.S. I fixed the formating of your original post. |
CyrilAllauzen | 04 Mar 2008 - 11:25 |
| Many thanks once more, Cyril. (The last call needed to be aiter.SetValue(arc) but that was easy enough to figure out.) The approach I was taking had an error in it, anyway, so Push is much better. Best Roger |
RogerLevy | 04 Mar 2008 - 22:50 |
| One other question: is there an easy way of making LogArc? and LogWeight? operate in base 2 logs rather than in natural logs? Roger |
RogerLevy | 05 Mar 2008 - 17:52 |
| Unfortunately, they are no easy way to do this. Cyril |
CyrilAllauzen | 06 Mar 2008 - 10:44 |
| Got it. Also, I notice that RealWeight? and RealArc? are not yet implemented, is this correct? (they would be convenient for transparent post-hoc inspection of normalized log-semiring FSTs.) Best Roger |
RogerLevy | 08 Mar 2008 - 15:56 |
| No RealWeight? yet. The reason was we never used the real semiring for anything else than toy example with the FSM library. But, we might decide to add it to the next release since people seem to request it. Best, Cyril |
CyrilAllauzen | 11 Mar 2008 - 11:59 |
| I have written some code for printing all strings accepted by an FST together with their weights. (Clearly a bad idea in general, but useful right now for some tests I'm running with very small FSTs.) Since I am new to both C++ and OpenFst? I wonder whether there is a more natural way to write this functionality. In particular I wonder: 1) is there a better way of checking whether a state is final? 2) whether there are better ways to implement the accumulation of weights along the path through the FST, perhaps by using a Weight class directly?
void printAllStrings(VectorFst<StdArc> &fst) {
vector<int> str(0);
printAllStringsHelper(fst, 0, str, 0.0);
}
void printAllStringsHelper(VectorFst<StdArc> &fst, int state, vector<int> str, float cost) {
if(fst.Final(state).Value() < 100000)
cout << vectorToString(str) << " with cost " << (cost + fst.Final(state).Value()) << endl;
for(ArcIterator<VectorFst<StdArc> > aiter(fst,state); ! aiter.Done(); aiter.Next()) {
StdArc arc = aiter.Value();
str.push_back(arc.ilabel);
printAllStringsHelper(fst, arc.nextstate, str, cost + arc.weight.Value());
str.pop_back();
}
}
string itos(int i) { // convert int to string; from Bjarne Stroustrup's FAQ
stringstream s;
s << i;
return s.str();
}
string vectorToString(vector<int> v) {
if(v.size() == 0)
return "<>";
string result = "<" + itos(v[0]);
for(int i = 1; i < v.size(); i++) {
result += "," + itos(v[i]);
}
return result + ">";
}
Many thanks in advance for any pointers! Roger |
RogerLevy | 01 Mar 2008 - 21:04 |
| hmm...TWiki has done some damage here to my double-equals equality test, to my plus-equals assignments, and also has erased some templates...hopefully the code is still reasonably readable! I've posted it at http://idiom.ucsd.edu/~rlevy/print_all_strings Roger |
RogerLevy | 01 Mar 2008 - 21:09 |
1. Yes! There is a much better to test if a state is final.if (fst.Final(state) == Weight::Zero()) // if state is not final Where you can replace Weight by the appropriate weight class (even better do a typedef of the current weight to Weight). 2. Indeed, you should try to use the Weight class operations. You should almost never need to use weight.Value(). Even cout << weight and cin >> weight should work since the operators << and >> are defined. Remember, the operation to combine weight along a path is Times. So in your code you should have TropicalWeight costand Times(cost, arc.weight)P.S. I fixed the formatting of your original post |
CyrilAllauzen | 04 Mar 2008 - 12:12 |
| Many thanks, Cyril, this is great! Roger |
RogerLevy | 04 Mar 2008 - 22:47 |
| Roger, 1. Do you have a final version of your code to print all paths with their weights?, and 2. Would you consider sharing it? Thanks, Ken |
KennethRBeesley | 11 Jun 2008 - 11:52 |
| Hi Ken, It looks like the current version of my code is pretty similar to the earlier version: // a bad idea if there are loops :) void printAllStrings(VectorFst? vector TropicalWeight? tw(TropicalWeight? ::One()); printAllStringsHelper(fst, 0, str, tw); } void printAllStringsHelper(VectorFst? if(fst.Final(state) = TropicalWeight? ::Zero()) cout << vectorToString(str) << " with cost " << (Times(cost,fst.Final(state))) << endl; for(ArcIterator? <VectorFst StdArc? arc = aiter.Value(); str.push_back(arc.ilabel); printAllStringsHelper(fst, arc.nextstate, str, Times(cost, arc.weight.Value())); str.pop_back(); } } string vectorToString(vector if(v.size() return "<>"; string result = "<" + itos(v[0]); for(int i = 1; i < v.size(); i++) { result + "," + itos(v[i]); } return result + ">"; } string itos(int i) { // convert int to string; from Bjarne Stroustrup's FAQ stringstream s; s << i; return s.str(); } I have a similar but separate function pair for LogArc? FSTs. It would be nice to have a single function pair that works for any arc type, and except for the types the code would be the same, but I don't see how to do it because there isn't a common superclass for LogArc? and StdArc? . If someone else sees how to do this, let me know! And please feel free to use this code :) Roger |
RogerLevy | 11 Jun 2008 - 20:17 |
| Ah, I also forgot -- I believe that arc.weight.Value() can be replaced by arc.weight, as per Cyril's suggestion earlier. | RogerLevy | 11 Jun 2008 - 20:18 |
| Roger, Many thanks for posting this code. One comment at this point: the initial call to printAllStringsHelper(fst, 0, str, tw) assumes that the start state is state 0. After combining and optimizing fsts, this may not be the case. Better to use printAllStringsHelper(fst, fst.Start(), str, tw). Ken |
KennethRBeesley | 13 Jun 2008 - 11:52 |
| Roger, Another possibility: generalization to print Input vs. Output strings. Your code currently uses arc.ilabel. You could do enum Projection { PROJECT_INPUT, PROJECT_OUTPUT } ; void printAllStrings(VectorFst? &fst, Projection proj) { ... printAllStringsHelper(fst, fst.Start(), str, tw, proj) ; } .... if (proj == PROJECT_INPUT) { str.push_back(arc.ilabel) ; } else { str.push_back(arc.olabel) ; } Ken |
KennethRBeesley | 13 Jun 2008 - 12:15 |
| Roger, In testing for a final state, I think that your code is incorrect (or garbled in the posting). You have if(fst.Final(state) = TropicalWeight? ::Zero()) but (as I understand it) a state is final if the exit weight is not equal to Weight::Zero(), so if (fst.Final(state) = TropicalWeight? ::Zero()) with a not-equal operator, is what you want. Straighten me out as necessary. Ken |
KennethRBeesley | 13 Jun 2008 - 12:37 |
| Roger, I see that my "correction" got corrupted the same way when posted. The test should be if (fst.Final(state) notEqualTo TropicalWeight? ::Zero()) where notEqualTo is the usual exclamation mark followed by an equal-sign. Ken |
KennethRBeesley | 13 Jun 2008 - 14:04 |
| In another word, why does StdArc use TropicalWeight instead of FloatWeight or LogWeight. Is there some material that explains the significance of TropicalWeight? Also, what's the reason that there are Plus(), Times() and Divide() implemented for FloatWeight()? Because it is too trivial? |
JiaPu | 29 Feb 2008 - 18:57 |
| sorry, type. It should read: ".... there are NOT Plus() ..." |
JiaPu | 29 Feb 2008 - 18:58 |
| sorry, typo. It should read: ".... there are NOT Plus() ..." |
JiaPu | 29 Feb 2008 - 18:58 |
StdArc use TropicalWeight and LogArc use LogWeight. StdArc could have been call TropicalArc, but since it is the most commonly used [it correspond to the Viterbi approximation] a shorter name like StdArc was prefered. If you don't like it, you can still do:typedef StdArc TropicalArc; FloatWeight is just an abstract class from which LogWeight and TropicalWeight are derived. It does not correspond to a semiring, that's why it does not have Plus(), Times(), ... |
CyrilAllauzen | 04 Mar 2008 - 11:43 |
| For the significance of the tropical semiring, you can check out the papers in FstBackground? or this paper or that one. | CyrilAllauzen | 04 Mar 2008 - 11:50 |
| Cyril, Please give me an algebra 101. According to the definition of semiring on wiki, float (or rational number) is a semiring. And when authorizing finite state grammar for speech recognition, it is more natural to use float as weight. Although, internally log weight is preferred. |
JiaPu | 04 Mar 2008 - 15:23 |
| Float in FloatWeight only means than the underlying representation is a C/C++ float type. TropicalWeight and LogWeight both use C/C++ float to represent real numbers, hence they are both derived from the partially abstract type FloatWeight. Of course, the real number with as operation the usual plus and times is also a semiring. It is not implemented in the library for now because it is not very commonly used. We might add it in a future version. If we do, it will also be as a type derived from FloatWeight. Float just defines the set, it is not enough to define a semiring. Different times and plus operations would lead to different semiring sharing the same set of elements. Finally, you didn't have to all the way to wikipedia, there is some documentation on here on this website. |
CyrilAllauzen | 04 Mar 2008 - 19:10 |
| When compiling following code: StdVectorFst in_fst; ... add some states and arcs to in_fst ... StdVectorFst out_fst Determinize(fst, out_fst); I got compiler error: /Volumes/Data/Projects/OpenFst/macosx/Corpus2SphinxFsg.cpp:82: error: no matching function for call to 'Determinize(fst::StdVectorFst&, fst::StdVectorFst&)' Since StdVectorFst? is derived from Fst, I don't understand that why this doesn't compile. |
JiaPu | 28 Feb 2008 - 12:48 |
| Aha, never mind. Just realize that Determinize() expects a pointer instead of a reference on the second argument. But why can't we use reference on both arguments? | JiaPu | 28 Feb 2008 - 13:18 |
| This is one of our coding conventions: input arguments are declared as const references and output arguments are declared as pointers. | CyrilAllauzen | 29 Feb 2008 - 14:26 |
| Are there any plans for a new release of OpenFst? ? If so, I have some requests ... Ken |
KennethRBeesley | 06 Feb 2008 - 17:20 |
| There will be a new release of OpenFst? . You're welcome to make some suggestions. | CyrilAllauzen | 08 Feb 2008 - 15:20 |
| Cyril, Very glad to hear about the new release. I respectfully request the following: void Copy(&In, *Out) // create a deep copy of the input automaton Convenience boolean functions for testing an existing network: isLabelSorted(&Fst, side) isFunctional isAcceptor isWeighted isEpsilonFree isDeterministic Finally, a new text input-output format that is Unicode-capable. I'll try to write up some more specific suggestions for an XML-based notation. Thanks for all your good work, Ken |
KennethRBeesley | 13 Feb 2008 - 23:53 |
| Hi Ken, A deep copy can already be performed using Map() with the IdentityMapper, however this might not be obvious at first so a Copy() could indeed be something useful. StdVectorFst in; StdVectorFst out; Map(in, &out, IdentityMapper<StdArc>());For the boolean functions, I'm not so sure that that: if(isAcceptor(fst)) ... is much simpler than: if(fst.Properties(kAcceptor, true)) ... Best, Cyril |
CyrilAllauzen | 29 Feb 2008 - 12:48 |
| For functional, the issue is that the best algorithms to test for functionality are quadratic in the number of transitions. This prevented us from using it as a property bits, that's why there are no kFunctional. | CyrilAllauzen | 29 Feb 2008 - 12:53 |
| Thanks Cyril, Your fst.Properties(kAcceptor, true) example was what I needed to get oriented. No boolean function isAcceptor(fst) is needed. Other beginners like me might want to look in fst/lib/properties.h for more property names, including: kAcceptor kNotAcceptor kWeighted kNotWeighted kEpsilons kNoEpsilons kIDeterministic (deterministic on the input side) kNonIDeterministic kODeterministic (deterministic on the output side) kNonODeterministic kILabelSorted (sorted with regard to input labels) kNotILabelSorted kOLabelSorted (sorted with regard to output labels) kNotOLabelSorted |
KennethRBeesley | 29 Feb 2008 - 17:06 |
| I'm having trouble compiling another example from the Quick Tour page, to get myself going with FST composition. here it is: #include "fst/lib/fstlib.h" int main() { using fst::StdVectorFst; using fst::StdFst; using fst::StdOLabelCompare; using fst::StdILabelCompare; using fst::ArcSort; StdFst? *input = StdFst? ::Read("input.fst"); StdFst? *model = StdFst? ::Read("model.fst"); ArcSort? (input, StdOLabelCompare? ()); ArcSort? (model, StdILabelCompare? ()); StdVectorFst? result; Compose(*input, *model, &result); } Trying to compile gives me the following error: rlevy@truffle:~/src/C++$ g++ -I /local/contrib/cpl/OpenFst Example1.cpp /local/contrib/cpl/OpenFst/fst/lib/libfst.so -lpthread Example1.cpp: In function ‘int main()’: Example1.cpp:16: error: no matching function for call to ‘ArcSort(fst::StdFst*&, fst::StdOLabelCompare)’ Example1.cpp:17: error: no matching function for call to ‘ArcSort(fst::StdFst*&, fst::StdILabelCompare)’ It's not obvious to me why it's not matching the function properly -- any ideas? Many thanks! Roger |
RogerLevy | 01 Feb 2008 - 01:14 |
| This does not work because the input of ArcSort needs to be a mutable fst (of type MutableFst There are several solutions for this: 1. Restrict the program to only accepts as input VectorFst which is mutable: StdVectorFst *input = StdVectorFst::Read("input.fst"); StdVectorFst *model = StdVectorFst::Read("model.fst"); Simple but not general. 2. Copy model and input into a VectorFst before sorting: StdVectorFst input2(*input); ArcSort(&input2, StdOLabelCompare()); 3. Use on-the-fly sorting: ArcSortFst StdOLabelCompare? > input2(*input, StdOLabelCompare()); Best, Cyril |
CyrilAllauzen | 29 Feb 2008 - 13:19 |
| I am getting the same error, except I have already declared mutable FSTs. I would really appreciate some help! My code is of the form: #include "fst/fstlib.h" using namespace std; using namespace fst; int main() { StdVectorFst? fst1; StdVectorFst? fst2; StdVectorFst? fst3; ArcSort? (fst1, StdOLabelCompare? ()); fst3 = ComposeFst? return 0; } Specifically, the compile error is: no matching function for call to 'ArcSort(fst::StdVectorFst&, fst::StdOLabelCompare)' Thank you very much, Jonathan |
JonathanB | 16 Oct 2009 - 12:03 |
| This is a newbie question about how to successfully compile a C++ program that refers to the OpenFst? library. I'm a newcomer to C++, though I'm comfortable with C and with object-oriented programming (e.g., Java). As a first example I tried constructing a program out of the example "Creating FSTs Using Constructors and Mutators From C++": *** #include "fst/lib/fstlib.h" int main() { using fst::StdVectorFst; using fst::StdArc; // A vector FST is a general mutable FST StdVectorFst? fst; // Adds state 0 to the initially empty FST and make it the start state. fst.AddState(); // 1st state will be state 0 (returned by AddState? ) fst.SetStart(0); // arg is state ID // Adds two arcs exiting state 0. // Arc constructor args: ilabel, olabel, weight, dest state ID. fst.AddArc(0, StdArc? (1, 1, 0.5, 1)); // 1st arg is src state ID fst.AddArc(0, StdArc? (2, 2, 1.5, 1)); // Adds state 1 and its arc. fst.AddState(); fst.AddArc(1, StdArc? (3, 3, 2.5, 2)); // Adds state 2 and set its final weight. fst.AddState(); fst.SetFinal(2, 3.5); // 1st arg is state ID, 2nd arg weight fst.Write("binary.fst"); } *** When I try to compile this program (with filename Example.cpp) using g++ version 4.1.2 (on a 64-bit Linux machine), I get some errors which I'm having a hard time deciphering: rlevy@morel:~/src/C++$ g++ -I /local/contrib/cpl/OpenFst Example.cpp /tmp/cc1hh0FE.o: In function `fst::SymbolTable::Write(std::basic_ostream Example.cpp:(.text._ZNK3fst11SymbolTable5WriteERSo[fst::SymbolTable::Write(std::basic_ostream /tmp/cc1hh0FE.o: In function `fst::FstImpl::WriteHeaderAndSymbols(std::basic_ostream Example.cpp:(.text._ZNK3fst7FstImplINS_6StdArcEE21WriteHeaderAndSymbolsERSoRKNS_15FstWriteOptionsEiPNS_9FstHeaderE[fst::FstImpl::WriteHeaderAndSymbols(std::basic_ostream /tmp/cc1hh0FE.o: In function `fst::CompatProperties(unsigned long long, unsigned long long)': Example.cpp:(.text._ZN3fst16CompatPropertiesEyy[fst::CompatProperties(unsigned long long, unsigned long long)]+0xd7): undefined reference to `fst::PropertyNames' /tmp/cc1hh0FE.o: In function `unsigned long long fst::TestProperties(fst::Fst const&, unsigned long long, unsigned long long*)': Example.cpp:(.text._ZN3fst14TestPropertiesINS_6StdArcEEEyRKNS_3FstIT_EEyPy[unsigned long long fst::TestProperties(fst::Fst const&, unsigned long long, unsigned long long*)]+0x18): undefined reference to `FLAGS_fst_verify_properties' collect2: ld returned 1 exit status Could anyone lend me insight into the problem I'm encountering here? Many thanks, and apologies for the newbie question... Best Roger |
RogerLevy | 27 Nov 2007 - 13:33 |
| You simply need to link with the library (and with the pthread library)! For instance: $ g++ -I /local/contrib/cpl/OpenFst Example.cpp /local/contrib/cpl/OpenFst/fst/lib/libfst.so -lpthread |
CyrilAllauzen | 27 Nov 2007 - 15:37 |
| Thank you so much Cyril -- this solved the problem! Best Roger |
RogerLevy | 27 Nov 2007 - 19:59 |
| I have uploaded a very preliminary patch that autoconfiscates OpenFst? . There are four files, a configure.ac, a Makefile.am, and two template files to generate test shell scripts. Suppose the patch and uncompressed/untarred OpenFst? are in directory ~/stage, then, assuming one has a recent autoconf, automake, and libtool installed and one wishes to install to a subdirectory ~/usr of one's home directory $ cd ~/stage/OpenFst; patch < ../openfst_auto_20071112.diff $ autoreconf --install $ ./configure --prefix=$HOME/usr $ make $ make install will install OpenFst? . I have the impression that the headers go to ~/usr/include/fst/lib, the executables, which all begin with "fst", go to ~/usr/bin, and the library libfst goes to ~/usr/lib. To run tests for whether dlopen() works $ make test I have tested the patch only on Mac OS 10.4. I have tested building in a directory separate from the source directory. I have not tested the installed executables. Is libfstmain also supposed to be accessible after installation? Right now I believe the patch statically links it into the executables. I have not extended the patch to work on Cygwin. Right now the patch has no host dependent code, but for Cygwin I believe I would be forced to use AC_CANONICAL_HOST and then test for Cygwin so that the proper flags are passed to produce dynamic modules and not static libraries. I am willing to keep working on the patch to fill your requirements. Right now the patch does not recursively run make on subdirectories as there are only three subdirectories to process. I also disclaim any copyright on the patch--the patch is completely in the public domain. |
DavidShao | 13 Nov 2007 - 00:30 |
| What happens when you construct a VectorFst from some lazy machine, like in the following example: Fst* composed = new ComposeFst(fst1, fst2); VectorFst fst3(*composed); Is the result fst3 still a lazy machine? The background: I need to pass a machine into a function that needs a MutableFst. I can't pass ComposeFst or Fst directly, so I need to transform it to VectorFst, but want to be sure that the states are still only expanded lazily. Thanks! Markus |
MarkusD | 08 Nov 2007 - 13:46 |
| No, fst3 is not a lazy machine. The constructor of VectorFst will expand the lazy machine. Yes, you need to copy a ComposeFst into a VectorFst before passing it as an argument to a function that needs a MutableFst. If a function (provided by the OpenFst library) requires its argument to be a MutableFst (or ExpandedFst) that means that the underlying algorithm only works with on expanded machine. Best, Cyril |
CyrilAllauzen | 09 Nov 2007 - 13:58 |
| Hi, I am starting to work on porting openFst to Windows/VisualC++. Is anyone interested or already working on this subject ? Thank you, Chris |
ChristopherKermorvant | 30 Oct 2007 - 07:10 |
| I am generally interested in seeing OpenFst? ported to Windows, especially if the result remains free, open-source and Apache-licensed. Please keep us informed. Ken |
KennethRBeesley | 25 Sep 2008 - 16:47 |
| Hi all, I have a FST and a training data set (input and output pairs). How should I evaluate the weights for the arcs? I want to count the number of times each arc has been crossed. But each time I compose the FST with the input, I get a new FST and the stateID would change. And I will not know which arc it corresponds to in the original FST. How should I deal with this problem? Many thanks in advance! Wei |
WeiChen | 04 Oct 2007 - 09:10 |
| Hi all, While testing some very simple examples I came across something which confused me w.r.t. the behavior of fstminimize: Running fstrmepsilon on the output of fstminimize can actually produce a more "minimal" FST in some situations. Is this to be expected? Small example: #!/bin/bash function draw_fst() { fst_file="$1" dot_file="${1%%.fst}.dot" ps_file="${1%%.fst}.ps" fstdraw --title="$fst_file" "$fst_file" "$dot_file" dot -Tps "$dot_file" >"$ps_file" gv "$ps_file" & } cat < 0 1 a x 1 2 b y 2 EOF cat < a 1 b 2 EOF cat < x 1 y 2 EOF fstcompile \ --isymbols=a.isyms \ --osymbols=a.osyms \ --keep_isymbols \ --keep_osymbols \ a.txt a.fst fstunion a.fst a.fst union.fst draw_fst union.fst fstminimize union.fst minimize.fst draw_fst minimize.fst fstrmepsilon minimize.fst rmepsilon.fst draw_fst rmepsilon.fst Cheers, Andy |
AndySchlaikjer | 19 Sep 2007 - 20:05 |
| Seems as though the code got a little mangled in the original post. Each of the lines beginning with "cat " is missing the "here-document" operator (two less-than signs + "EOF").. Hope it makes sense. | AndySchlaikjer | 19 Sep 2007 - 20:08 |
| This is indeed completely expected. 1. Determinize and Minimize treats epsilon as a regular symbol. Hence, if the input has epsilon transitions, the result is not minimal if you interpret epsilon as the empty word (i.e. true epsilon). 2. Assuming that the input has no epsilon transitions, Minimize will produce the minimal deterministic automaton equivalent to the input. There might be non-deterministic automata with fewer states however. What you observe is a combination of 1+2 since epsilon-removal indeed interprets epsilon as the empty word and produce a non-deterministic machine. Best, Cyril |
CyrilAllauzen | 20 Sep 2007 - 13:15 |
| Ah, i see now. Union was producing a non-deterministic machine for this example, and i was handing that over to Minimize which actually requires as input a deterministic machine. This was made clearer when I tried to run the sequence "Union, Epsnormalize, Minimize" and Minimize produced the error "FATAL: Input Fst is not deterministic". So it seems as though there are two ways to deal with this: (1) "Union, Epsnormalize, Determinize, Minimize", to ensure both Determinize and Minimize wouldn't encounter epsilon symbols, or (2) "Union, Minimize, Rmepsilon", though I'm not convinced this second one will always produce equivalent output.. So are there other situations in which it can be useful (perhaps for efficiency?) to run Determinize and Minimize algorithms on machines containing "partial" non-determinism due to epsilon edges? Thanks, Andy |
AndySchlaikjer | 20 Sep 2007 - 14:31 |
| In (1), no need to use EpsNormalize, you can simply use RmEpsilon instead. If you follow (2) by Determinize and Minimize you should get the same result as (1). Indeed, in some situation you cannot remove epsilons because it will lead to a space blow up. In that case, you might still want to optimize the machine with epsilons using determinization and minimization. Best, Cyril |
CyrilAllauzen | 21 Sep 2007 - 11:52 |
| Hello, I have an FST with log-arcs (an acceptor). I use ShortestDistance? to compute the sum of its paths from initial state to all final states, and it somewhere around 0. However, after I run fstdeterminize, the sum of paths becomes -0.26 instead of the value close to 0. Shouldn't fstdeterminize maintain the value of sum of paths? Thanks Jerr. |
JerrRo | 05 Sep 2007 - 15:43 |
| Indeed, fstdeterminize should maintain the value of the sum of all paths. Are you doing anything else than determinization (no minimization for instance)? And is everything indeed done in the log semiring? The issue might be due to numerical instabilities. One solution would then be to use a smaller delta (in DeterminizeFstOptions? ). Is your machine cyclic? Is the size of the determinized machine much larger than the original machine? The issue might then be that the machine is not determinizable and that fstdetermize terminates because of the numerical approximations. It is unlikely but possible. In the log semiring, a cause of non-determinizablility is having two states q and q' both accessible from the initial state with paths labeled with the same input string x and such that q and q' both admits cycles labeled with the same input string y and the weight of that cycles are not equal. |
CyrilAllauzen | 09 Sep 2007 - 14:10 |
| Hi, running make on a 64-bit machine does not work. Adding -fPIC to the CFLAGS in the Makefiles of bin, lib and test solves that problem. Cheers, Christian |
ChristianKofler | 04 Sep 2007 - 11:23 |
| Suppose I have a subroutine that creates an empty transducer from heap memory, populates it somehow according to passed-in arguments, and passes back a ptr to it StdVectorFst * makeFst( ... params ...) { StdVectorFst? *newfstp = new StdVectorFst? () ; // get ptr to an empty FST // add arcs, specify initial and final states return newfstp ; } And assume that in the calling program I eventually want to delete it. Is it as simple as calling 'delete' ; StdVectorFst *ptr = makeFst(...args...) ; // do something with the Fst, perhaps mutate it // until I'm done with it delete ptr ; // will this work as expected and avoid memory leaks? Thanks, Ken |
KennethRBeesley | 23 Aug 2007 - 17:59 |
| Yes, this will work. | CyrilAllauzen | 23 Aug 2007 - 18:24 |
| 1. Is there a general crossproduct algorithm that takes two acceptor networks and produces a tranducer that encodes the crossproduct? 2. How about complement/negation? I presume that it would be limited to unweighted networks? Thanks, Ken |
KennethRBeesley | 23 Aug 2007 - 13:03 |
| 1. You simply needs to create a transducer T with one unique state and a transition with labels (a,b) for each input symbol a and each output symbol b. You can then use composition: A o T o B would represent the cross product. 2. Complement is currently a library internal-only operation. It is limited to unweighted deterministic automata. |
CyrilAllauzen | 23 Aug 2007 - 18:10 |
| 2. Forgot to mention that you should use Difference instead of Complement. | CyrilAllauzen | 25 Aug 2007 - 13:14 |
| You confirmed earlier that in Symbol Table Files that map symbols to internal integer labels, one could simply use real Unicode code point values for the integers. That was good news, but it leads to a few questions: 1. In the old AT&T documentation I read "Some FSM operations allocate internal arrays based on the maximum inte- ger used as an input or output arc symbol. This design choice, chosen for efficiency, requires the user to avoid huge integer labels (e.g., INT_MAX) since memory may otherwise be exhausted." Is that also the case with OpenFst? ? If so, it would be a major disincentive to using real code point values, especially if (like myself) you work with characters encoded in the Unicode supplementary area. Comments? 2. If you are using Supplementary Chars, or even Greek, Cyrillic, Hebrew, etc., then symbol table files could intuitively contains lines such as the following (for the Deseret Alphabet): ?? 66560 ?? 66561 ?? 66562 ?? 66563 ?? 66564 ?? 66565 etc. Using such a symbol table, encoded in Unicode UTF-8, I tried 'fstcompile' on the following file, also encoded in UTF-8 0 1 a b 0.5 1 2 ?? ?? 0.5 2 3 x y 0.5 3 1.0 fstcompile choked on line 2, which contains the first characters in this file outside the ASCII range. fstcompile has many options, but I haven't seen anything (yet) that would cause it to handle the text input and symbol-table files as UTF-8. Have I missed something? 3. (Previously mentioned topic) If one does use real Unicode code-point values for the label integers, it would be convenient to allow them to be represented, in the symbol-table files, in hex, e.g. ?? 0x10400 ?? 0x10401 ?? 0x10402 ?? 0x10403 ?? 0x10404 ?? 0x10405 etc. Comments would be welcome. Thanks, Ken |
KennethRBeesley | 23 Aug 2007 - 12:56 |
| I see that the Unicode chars did not survive the reposting process in my question above. When pasted in, each ?? was a Unicode char. Ken |
KennethRBeesley | 23 Aug 2007 - 13:05 |
| The library can only handle ascii for symbol files and fst text format. Note that the use of a symbol file is not required at all. You can directly define your fst using the "code-point value". Or write a script that turn the UTF-8 characters into their "code-point value". |
CyrilAllauzen | 24 Aug 2007 - 09:30 |
| Thanks for the response. I see that I asked too many questions in one posting. Sorry. Here's Question #1 again: In the old AT&T documentation I read "Some FSM operations allocate internal arrays based on the maximum inte- ger used as an input or output arc symbol. This design choice, chosen for efficiency, requires the user to avoid huge integer labels (e.g., INT_MAX) since memory may otherwise be exhausted." Is that also the case with OpenFst? ?? If so, it could be a major disincentive to using real code point values as labels, especially if (like myself) you work with characters encoded in the Unicode supplementary area. Thanks, Ken |
KennethRBeesley | 24 Aug 2007 - 18:18 |
| No, it is not the case for OpenFst. The library does not allocate such arrays. | CyrilAllauzen | 29 Aug 2007 - 11:21 |
| Hi, If I am not mistaken, when calculating shortest distance with reverse=true (say, in the log semiring), meaning - the backward values, the weight for states which have no access to the final state should be LogWeight? ::Zero (or +inf). However, when I did that for a machine of the following 0-1> 0->2 1 is final, 2 is not final, I got from ShortestDistance? LogWeight? ::One() (very small value close to 0) when calculating shortestdistance with reverse=true for state 2. Am I misunderstanding something? Thanks. Jerr. |
JerrRo | 17 Aug 2007 - 13:28 |
You should indeed get LogWeight::Zero() for state 2. However, the size of the vector returned by ShortestDistance can be less than NumStates(),i.e, it is the maximum state visited as mentioned here.If a state i is such that i < distance.size(), then its shortest distance is distance[i] otherwise it is Weight::Zero().In the example you gave, I suspect that distance.size() is 1 since 2 would likely not be visited by ShortestDistance when reverse is true.Best, Cyril |
CyrilAllauzen | 17 Aug 2007 - 16:45 |
| I am trying to compose (programmatically) a transducer with itself several times. The FST is a result of many unions. The composition of that FST with itself takes a long time. I am trying to figure out what could be a good way to do that efficiently. Is composition slower with many epsilon moves? Would it be recommended to first determinize the FST and remove epsilon moves before composing it with itself (the original FST has many epsilon moves, being a result of a union)? | JerrRo | 14 Aug 2007 - 14:55 |
| Indeed, union introduces epsilon-transitions and epsilon-transitions will slow down composition. Non-determinism will also slow down composition. So, I would recommend you first epsilon-remove and then determinize your fst. However, there is always a risk with determinization (it might blow up or even not terminate). Hence, if what I suggest does not work better, you should investigate using determinize only, or rmepsilon only or determinize followed by rmepsilon. Cyril |
CyrilAllauzen | 14 Aug 2007 - 15:21 |
| Thanks for your quick response. I am using a different semi-ring I defined for the composition. For some odd reason, when trying to determinize the FST I get the error: FATAL: StringWeight? ::Plus: unequal arguments (non-functional FST?) I am not sure what it means. I am guessing the mention of StringWeight? is an error (in the error :)). Thanks. Jerr |
JerrRo | 14 Aug 2007 - 15:37 |
| The library currently only supports the determinization of functional transducers (if two successfull paths have the same input label, they need to also have the same output label). The reason for that is that we use the weighted automata determinization algorithm viewing the output labels as weights in the string semiring (hence StringWeight? in the error message). A workaround is to use Encode/Decode to view the transducer as an acceptor, considering the pair (ilabel, olabel) as one symbol. 1. Encode: EncodeMapper encoder(kEncodeLabels, ENCODE); Encode(fst, &encoder); 2. Determinize. 3. Decode: Decode(fst, encoder); |
CyrilAllauzen | 14 Aug 2007 - 16:16 |
| Is it possible that RmEpsilonFst? /ComposeFst have a bug? When I tried creating an RmEpsilon? FST, I got SIGABRT later when trying to compose it with another machine. This means the following doesn't work: RmEpsilonFst ComposeFst (get here SIGABRT - probably because of double free/delete) while this works: RmEpsilonFst ComposeFst (just changed compose line, even keep the rm epsilon without using the resulting FST). MyArc/MyWeight are debugged quite well (I use them in so many other places, and they seem to be working well), so I suspect the problem is not there. I will try to check the FST library code, to see if anything pops out immediately, but I suspect that won't happen. Thanks. Jerr. |
JerrRo | 14 Aug 2007 - 17:08 |
| Thanks Jerr., I will look into that. In the meantime, I recommend you use the destructive version of epsilon-removal: RmEpsilon(&A). It is also more efficient in general. Cyril |
CyrilAllauzen | 14 Aug 2007 - 20:12 |
| Just to make sure I got it right: 2. Determinize. Here I should output the fst to some new MutableFst? (maybe it is even possible to call Determinize(myFst, &myFst) without fear?) 3. Decode: Decode(fst, encoder); Here I should only decode the older FST (not the new MutableFst? ) to restore it to its previous state. The new determinized MutableFst? will be already decoded, so there is no need to change it. Thanks. Jerr. |
JerrRo | 15 Aug 2007 - 08:03 |
| 2. Both should work. 3. You want to decode the result of determinization (using the same encoder as for encoding). You want to decode the original fst only if you still need to use it. Encode and Decode take as argument pointers to MutableFst. |
CyrilAllauzen | 15 Aug 2007 - 09:44 |
| If I've constructed an fst and can write it out in binary form using fst.Write("binary.fst") how can I write out the same FST in AT&T FSM format? Thanks, Ken |
KennethRBeesley | 13 Aug 2007 - 15:51 |
The library only supports writing fsts in binary format, since this is the "working" format that all the utilities use. The command-line utility fstprint reads a binary fst and writes out its textual (AT&T FSM format) representation.Best, Cyril |
CyrilAllauzen | 14 Aug 2007 - 11:02 |
| In the context of a GUI/IDE built on top of OpenFst? , it might well be useful to have the library support 1) writing out an fst in its textual (AT&T FSM Format) representation, and 2) drawing the fst (via DOT) Any chance of adding such support to future versions of the library? Thanks, Ken |
KennethRBeesley | 21 Aug 2007 - 14:13 |
| Maybe. As of now, nothing prevents someone from simply using the FstPrinter and FstDrawer classes (from fst/bin/print-main.h and fst/bin/draw-main.h) by copy-pasting them into one's code. |
CyrilAllauzen | 24 Aug 2007 - 09:34 |
| Hello, I am unioning several thousand of FSTs together... When I use the final UnionFst? to create a VectorFst? (new VectorFst? <>(unioned_fst)), it takes a long time to actually create that FST and it also takes a lot of memory. Is there a specific reason for such process to take such a long time when moving from UnionFst? to VectorFst? ? Thanks. Jerr. |
JerrRo | 13 Aug 2007 - 15:47 |
| The reason is that UnionFst is a delayed fst. That means it is only built when visited, i.e., it's only when converting to VectorFst that the actual union is computed. It will probably be more efficient to convert each intermediary UnionFst to VectorFst. In most cases, a very efficient way will be to apply epsilon-removal and determinization after each union. Best, Cyril |
CyrilAllauzen | 14 Aug 2007 - 11:11 |
| Thanks! | JerrRo | 14 Aug 2007 - 14:55 |
| Hello, After minimizing and determinizing a machine programmatically (using DeterminizeFst? and Minimize) I get all kind of isolated states that can never be reached. However, when I use fstminimize and fstdeterminize, this does not happen. Is there a way to get rid of these states programmatically? Right now, I am pretty much doing the same MinimizeMain? and DeterminizeMain? do, and still I get these isolated states. Thanks. Jerr. |
JerrRo | 09 Aug 2007 - 12:53 |
| You can get rid of theses states by using Connect(&mutable_fst). However, it is not clear to me why these non-accessible states appear. I would need to know more precisely what you are doing. Best, Cyril |
CyrilAllauzen | 09 Aug 2007 - 14:38 |
| The Quick Tour indicates, for symbol-table files: "You may use any string for a label" Question: Are there any restrictions or conventions on these strings? Question: If a string label contains more than one character, are there cases in the code where input strings must be 'tokenized' into individual labels based on the labels defined in the symbol-table files? If so, how is such tokenization performed. Question: How have multicharacter labels been used traditionally in the AT&T/OpenFst tradition? Examples? Thanks, Ken |
KennethRBeesley | 02 Aug 2007 - 14:00 |
| I guess these strings should not contain spaces, tabs and newlines. The symbol table does not play any role internally. It is only used for displaying and printing. No tokenization is performed. A multicharacter string is always treated as an indivisible label in the library. An example of multicharacter labels is the example given in the top banner of this page. More seriously, you can check out this paper for an example of multicharacter labels when building a language model. Best, Cyril |
CyrilAllauzen | 02 Aug 2007 - 14:26 |
| How would a literal space be represented in an AT&T FSM format file, and in a symbol-table file? Thanks, Ken |
KennethRBeesley | 23 Aug 2007 - 13:08 |
| It cannot. You would need to replace by an other symbol. As I was saying before, a symbol list is not required. I created fsts using ascii codes for labels. And in that case, it is simply better not to use a symbol list to create the fsts. |
CyrilAllauzen | 24 Aug 2007 - 09:39 |
| Hello, I have an FST with LogWeight? arcs. I would like to access these arcs and change them programatically. I wanted to do it using ArcIterator? , but Value() returns the arc as const. Is there other way to change the weight on an arc? thanks. jerr |
JerrRo | 01 Aug 2007 - 16:02 |
| You need to use MutableArcIterator instead of ArcIterator. You can then use the SetValue method to modify the arc pointed to by the iterator. Cyril |
CyrilAllauzen | 01 Aug 2007 - 17:23 |
| The OpenFst? Quick Tour indicates, for symbol table files, that "You may use any string for a label; you may use any non-negative integer for a label ID. The zero label is reserved for the epsilon label." OK, but just to make sure, I have a few questions: Question: Both examples use a dense range 0-3 of integers. Is this required or recommended? Question: Could one just use Unicode code point values as the integers? Question: Can one indicate the integer values in hex notation in the symbol-table file(s)? Question: What is the maximum integer value? Can it handle Unicode supplementary code point values? up to 0x10FFFF? Thanks, Ken |
KennethRBeesley | 26 Jul 2007 - 00:47 |
| You don't have to use dense range of integers. You can indeed use Unicode values as the integers. No, you cannot use hex notation in the symbol table file. The maximum integer value depends on the Arc::Label type. For StdArc? , Label is defined as int (signed 32 bits on most machines). Hence, 0x10FFFF should be fine. You can also define your own Arc type if you wish. Best, Cyril |
CyrilAllauzen | 29 Jul 2007 - 15:12 |
| We use strtoll(...,10) for reading integers. I guess we didn't use strtoll(,...0) (which would allow 012 or 0x12) because people might inadventedly have leading zeroes but want base 10. I could be convinced to change this if people feel strongly about it. -Michael |
MichaelRiley | 30 Jul 2007 - 19:51 |
| Cyril/Michael, Many thanks for your responses, and in general for making OpenFst? available. I (and I assume many others) are moving consistently toward using Unicode wherever possible for internal character representations, and in Unicode, Hex is the De Facto King. Having to use decimal for Unicode code point values is a nuisance and leads to errors. While I recognize your concern about those who might "inadvertently have leading zeroes but want base 10", I suspect that most of us are pretty well acquainted with the 12 vs. 012 vs. 0x12 distinction. Possibilities: 1. Change to using strtoll(...,0) to allow 12, 012 and 0x12. Someone might write a trivial symbolTableLint program to scan the files and print warnings if both decimal and (apparently) octal representations are found in the same file. 2. Or better (?), provide some kind of optional header in the symbol-table file, or some other convenient user-specified flag, to indicate that strtoll(...,0) should be used instead of (the default) strtoll(...,10). That would maintain backward/legacy compability; and if I use 0x12 without overtly indicating that strtoll(...,0) should be used, then presumably I'd get a helpful compile-time error. What do you think? Ken |
KennethRBeesley | 02 Aug 2007 - 13:39 |
| I'm running Mac OS X 10.4.10. I downloaded OpenFst? beta and edited bin/Makefile, bin/lib and bin/text, making the changes indicated for macosx. 'make all' seemed to work. But 'make test' yields: [titania:openfst/OpenFst/fst] beesley% make test ( cd lib ; make all ) make[1]: Nothing to be done for `all'. ( cd bin ; make all ) make[1]: Nothing to be done for `all'. (cd test ; make test ) g++ -I../.. -O2 -o weight_test.o -c weight_test.cc g++ -Wl,-L/Users/beesley/fsimpls/openfst/OpenFst/fst/test/../lib -o weight_test weight_test.o -lfst -lm -lpthread -ldl ./weight_test # Tests Weight classes dyld: Library not loaded: libfst.dylib Referenced from: /Users/beesley/fsimpls/openfst/OpenFst/fst/test/./weight_test Reason: image not found make[1]: * [test_weights] Trace/BPT trap make: * [test] Error 2 Question: what does this error indicate? and how can I fix it? Thanks, Ken |
KennethRBeesley | 26 Jul 2007 - 00:10 |
You need to add the paths to fst/lib, fst/bin and fst/test to your DYLD_LIBRARY_PATH environment variable:export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:/Users/beesley/fsimpls/openfst/OpenFst/fst/test:/Users/beesley/fsimpls/openfst/OpenFst/fst/bin:/Users/beesley/fsimpls/openfst/OpenFst/fst/lib |
CyrilAllauzen | 29 Jul 2007 - 14:48 |
| I | Attachment | Action | Size | Date | Who | Comment |
|---|---|---|---|---|---|---|
| |
label-pushing.jpg | manage | 6.8 K | 15 Oct 2009 - 20:30 | CyrilAllauzen | Label pushing example: result of label pushing as an automaton over the string semiring |
| |
openfst_auto_20071112.diff | manage | 8.7 K | 13 Nov 2007 - 05:13 | DavidShao | Autoconfiscating patch |
| |
reverse-label-pushing.jpg | manage | 10.9 K | 15 Oct 2009 - 20:31 | CyrilAllauzen | Label pushing example: reverse, label-pushing towards initial state, reverse |