/* * File: lexicon.cpp * ----------------- * A lexicon is a word list. This lexicon is backed by two separate data * structures for storing the words in the list: * * 1) a DAWG (directed acyclic word graph) * 2) a Set of other words. * * Typically the DAWG is used for a large list read from a file in binary * format. The STL set is for words added piecemeal at runtime. * * The DAWG idea comes from an article by Appel & Jacobson, CACM May 1988. * This lexicon implementation only has the code to load/search the DAWG. * The DAWG builder code is quite a bit more intricate, see me (Julie) * if you need it. */ #include #include #include #include #include #include #include #include "error.h" #include "lexicon.h" #include "strlib.h" using namespace std; static void toLowerCaseInPlace(string & str); /* * The DAWG is stored as an array of edges. Each edge is represented by * one 32-bit struct. The 5 "letter" bits indicate the character on this * transition (expressed as integer from 1 to 26), the "accept" bit indicates * if you accept after appending that char (current path forms word), and the * "lastEdge" bit marks this as the last edge in a sequence of children. * The bulk of the bits (24) are used for the index within the edge array for * the children of this node. The children are laid out contiguously in * alphabetical order. Since we read edges as binary bits from a file in * a big-endian format, we have to swap the struct order for little-endian * machines. 
*/ Lexicon::Lexicon() { edges = start = NULL; numEdges = numDawgWords = 0; } Lexicon::Lexicon(string filename) { edges = start = NULL; numEdges = numDawgWords = 0; addWordsFromFile(filename); } Lexicon::~Lexicon() { if (edges) delete[] edges; } /* * Swaps a 4-byte long from big to little endian byte order */ static uint32_t my_ntohl(uint32_t arg) { uint32_t result = ((arg & 0xff000000) >> 24) | ((arg & 0x00ff0000) >> 8) | ((arg & 0x0000ff00) << 8) | ((arg & 0x000000ff) << 24); return result; } /* * Implementation notes: readBinaryFile * ------------------------------------ * The binary lexicon file format must follow this pattern: * DAWG::: */ void Lexicon::readBinaryFile(string filename) { long startIndex, numBytes; char firstFour[4], expected[] = "DAWG"; ifstream istr(filename.c_str(), IOS_IN | IOS_BINARY); if (false) my_ntohl(0); if (istr.fail()) { error("Couldn't open lexicon file " + filename); } istr.read(firstFour, 4); istr.get(); istr >> startIndex; istr.get(); istr >> numBytes; istr.get(); if (istr.fail() || strncmp(firstFour, expected, 4) != 0 || startIndex < 0 || numBytes < 0) { error("Improperly formed lexicon file " + filename); } numEdges = numBytes/sizeof(Edge); edges = new Edge[numEdges]; start = &edges[startIndex]; istr.read((char *)edges, numBytes); if (istr.fail() && !istr.eof()) { error("Improperly formed lexicon file " + filename); } #if defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN uint32_t *cur = (uint32_t *) edges; for (int i = 0; i < numEdges; i++, cur++) { *cur = my_ntohl(*cur); } #endif istr.close(); numDawgWords = countDawgWords(start); } int Lexicon::countDawgWords(Edge *ep) const { int count = 0; while (true) { if (ep->accept) count++; if (ep->children != 0) { count += countDawgWords(&edges[ep->children]); } if (ep->lastEdge) break; ep++; } return count; } /* * Check for DAWG in first 4 to identify as special binary format, * otherwise assume ASCII, one word per line */ void Lexicon::addWordsFromFile(string filename) { char 
firstFour[4], expected[] = "DAWG"; ifstream istr(filename.c_str()); if (istr.fail()) { error("Couldn't open lexicon file " + filename); } istr.read(firstFour, 4); if (strncmp(firstFour, expected, 4) == 0) { if (otherWords.size() != 0) { error("Binary files require an empty lexicon"); } readBinaryFile(filename); return; } istr.seekg(0); string line; while (getline(istr, line)) { add(line); } istr.close(); } int Lexicon::size() const { return numDawgWords + otherWords.size(); } bool Lexicon::isEmpty() const { return size() == 0; } void Lexicon::clear() { if (edges) delete[] edges; edges = start = NULL; numEdges = numDawgWords = 0; otherWords.clear(); } /* * Implementation notes: findEdgeForChar * ------------------------------------- * Iterate over sequence of children to find one that * matches the given char. Returns NULL if we get to * last child without finding a match (thus no such * child edge exists). */ Lexicon::Edge *Lexicon::findEdgeForChar(Edge *children, char ch) const { Edge *curEdge = children; while (true) { if (curEdge->letter == charToOrd(ch)) return curEdge; if (curEdge->lastEdge) return NULL; curEdge++; } } /* * Implementation notes: traceToLastEdge * ------------------------------------- * Given a string, trace out path through the DAWG edge-by-edge. * If a path exists, return last edge; otherwise return NULL. 
*/ Lexicon::Edge *Lexicon::traceToLastEdge(const string & s) const { if (!start) return NULL; Edge *curEdge = findEdgeForChar(start, s[0]); int len = (int) s.length(); for (int i = 1; i < len; i++) { if (!curEdge || !curEdge->children) return NULL; curEdge = findEdgeForChar(&edges[curEdge->children], s[i]); } return curEdge; } bool Lexicon::containsPrefix(string prefix) const { if (prefix.empty()) return true; toLowerCaseInPlace(prefix); if (traceToLastEdge(prefix)) return true; foreach (string word in otherWords) { if (startsWith(word, prefix)) return true; if (prefix < word) return false; } return false; } bool Lexicon::contains(string word) const { toLowerCaseInPlace(word); Edge *lastEdge = traceToLastEdge(word); if (lastEdge && lastEdge->accept) return true; return otherWords.contains(word); } void Lexicon::add(string word) { toLowerCaseInPlace(word); if (!contains(word)) { otherWords.add(word); } } Lexicon::Lexicon(const Lexicon & src) { deepCopy(src); } Lexicon & Lexicon::operator=(const Lexicon & src) { if (this != &src) { if (edges != NULL) delete[] edges; deepCopy(src); } return *this; } void Lexicon::deepCopy(const Lexicon & src) { if (src.edges == NULL) { edges = NULL; start = NULL; } else { numEdges = src.numEdges; edges = new Edge[src.numEdges]; memcpy(edges, src.edges, sizeof(Edge)*src.numEdges); start = edges + (src.start - src.edges); } numDawgWords = src.numDawgWords; otherWords = src.otherWords; } void Lexicon::mapAll(void (*fn)(string)) const { foreach (string word in *this) { fn(word); } } void Lexicon::mapAll(void (*fn)(const string &)) const { foreach (string word in *this) { fn(word); } } void Lexicon::iterator::advanceToNextWordInSet() { if (setIterator == setEnd) { currentSetWord = ""; } else { currentSetWord = *setIterator; ++setIterator; } } void Lexicon::iterator::advanceToNextWordInDawg() { if (edgePtr == NULL) { edgePtr = lp->start; } else { advanceToNextEdge(); } while (edgePtr != NULL && !edgePtr->accept) { advanceToNextEdge(); } } 
void Lexicon::iterator::advanceToNextEdge() { Edge *ep = edgePtr; if (ep->children == 0) { while (ep != NULL && ep->lastEdge) { if (stack.isEmpty()) { edgePtr = NULL; return; } else { ep = stack.pop(); currentDawgPrefix.resize(currentDawgPrefix.length() - 1); } } edgePtr = ep + 1; } else { stack.push(ep); currentDawgPrefix.push_back(lp->ordToChar(ep->letter)); edgePtr = &lp->edges[ep->children]; } }; static void toLowerCaseInPlace(string & str) { int nChars = str.length(); for (int i = 0; i < nChars; i++) { str[i] = tolower(str[i]); } }