boggle/lib/StanfordCPPLib/lexicon.h
2024-09-11 17:33:32 +02:00

367 lines
9.5 KiB
C++
Executable File

/*
* File: lexicon.h
* ---------------
* This file exports the <code>Lexicon</code> class, which is a
* compact structure for storing a list of words.
*/
#ifndef _lexicon_h
#define _lexicon_h
#include <string>
#include "foreach.h"
#include "set.h"
#include "stack.h"
/*
* Class: Lexicon
* --------------
* This class is used to represent a <b><i>lexicon,</i></b> or word list.
* The main difference between a lexicon and a dictionary is that
* a lexicon does not provide any mechanism for storing definitions;
* the lexicon contains only words, with no associated information.
* It is therefore similar to a set of strings, but with a more
* space-efficient internal representation. The <code>Lexicon</code>
* class supports efficient lookup operations for words and prefixes.
*
* <p>As an example of the use of the <code>Lexicon</code> class, the
* following program lists all the two-letter words in the lexicon
* stored in <code>EnglishWords.dat</code>:
*
*<pre>
* int main() {
* Lexicon english("EnglishWords.dat");
* foreach (string word in english) {
* if (word.length() == 2) {
* cout << word << endl;
* }
* }
* return 0;
* }
*</pre>
*/
#include <cctype>
class Lexicon {
public:
/*
* Constructor: Lexicon
* Usage: Lexicon lex;
* Lexicon lex(filename);
* -----------------------------
* Initializes a new lexicon. The default constructor creates an empty
* lexicon. The second form reads in the contents of the lexicon from
* the specified data file. The data file must be in one of two formats:
* (1) a space-efficient precompiled binary format or (2) a text file
* containing one word per line. The Stanford library distribution
* includes a binary lexicon file named <code>English.dat</code>
* containing a list of words in English. The standard code pattern
* to initialize that lexicon looks like this:
*
*<pre>
* Lexicon english("English.dat");
*</pre>
*/
Lexicon();
Lexicon(std::string filename);
/*
* Destructor: ~Lexicon
* --------------------
* The destructor deallocates any storage associated with the lexicon.
*/
virtual ~Lexicon();
/*
* Method: size
* Usage: int n = lex.size();
* --------------------------
* Returns the number of words contained in the lexicon.
*/
int size() const;
/*
* Method: isEmpty
* Usage: if (lex.isEmpty()) ...
* -----------------------------
* Returns <code>true</code> if the lexicon contains no words.
*/
bool isEmpty() const;
/*
* Method: clear
* Usage: lex.clear();
* -------------------
* Removes all words from the lexicon.
*/
void clear();
/*
* Method: add
* Usage: lex.add(word);
* ---------------------
* Adds the specified word to the lexicon.
*/
void add(std::string word);
/*
* Method: addWordsFromFile
* Usage: lex.addWordsFromFile(filename);
* --------------------------------------
* Reads the file and adds all of its words to the lexicon.
*/
void addWordsFromFile(std::string filename);
/*
* Method: contains
* Usage: if (lex.contains(word)) ...
* ----------------------------------
* Returns <code>true</code> if <code>word</code> is contained in the
* lexicon. In the <code>Lexicon</code> class, the case of letters is
* ignored, so "Zoo" is the same as "ZOO" or "zoo".
*/
bool contains(std::string word) const;
/*
* Method: containsPrefix
* Usage: if (lex.containsPrefix(prefix)) ...
* ------------------------------------------
* Returns true if any words in the lexicon begin with <code>prefix</code>.
* Like <code>containsWord</code>, this method ignores the case of letters
* so that "MO" is a prefix of "monkey" or "Monday".
*/
bool containsPrefix(std::string prefix) const;
/*
* Method: mapAll
* Usage: lexicon.mapAll(fn);
* --------------------------
* Calls the specified function on each word in the lexicon.
*/
void mapAll(void (*fn)(std::string)) const;
void mapAll(void (*fn)(const std::string &)) const;
template <typename FunctorType>
void mapAll(FunctorType fn) const;
/*
* Additional Lexicon operations
* -----------------------------
* In addition to the methods listed in this interface, the Lexicon
* class supports the following operations:
*
* - Deep copying for the copy constructor and assignment operator
* - Iteration using the range-based for statement and STL iterators
*
* All iteration is guaranteed to proceed in alphabetical order. All
* words in the lexicon are stored in lowercase.
*/
/* Private section */
/**********************************************************************/
/* Note: Everything below this point in the file is logically part */
/* of the implementation and should not be of interest to clients. */
/**********************************************************************/
private:
#ifdef _WIN32
#define LITTLE_ENDIAN 1
#define BYTE_ORDER LITTLE_ENDIAN
#endif
#pragma pack(1)
struct Edge {
#if defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN
unsigned long letter:5;
unsigned long lastEdge:1;
unsigned long accept:1;
unsigned long unused:1;
unsigned long children:24;
#else
unsigned long children:24;
unsigned long unused:1;
unsigned long accept:1;
unsigned long lastEdge:1;
unsigned long letter:5;
#endif
};
#pragma pack()
Edge *edges, *start;
int numEdges, numDawgWords;
Set<std::string> otherWords;
public:
/*
* Deep copying support
* --------------------
* This copy constructor and operator= are defined to make a
* deep copy, making it possible to pass/return lexicons by value
* and assign from one lexicon to another. The entire contents of
* the lexicon, including all words, are copied. Making copies is
* generally avoided because of the expense and thus, lexicons are
* typically passed by reference. When a copy is needed, these
* operations are supported.
*/
Lexicon(const Lexicon & src);
Lexicon & operator=(const Lexicon & src);
/*
* Iterator support
* ----------------
* The classes in the StanfordCPPLib collection implement input
* iterators so that they work symmetrically with respect to the
* corresponding STL classes.
*/
class iterator : public std::iterator<std::input_iterator_tag,std::string> {
private:
const Lexicon *lp;
int index;
std::string currentDawgPrefix;
std::string currentSetWord;
std::string tmpWord;
Edge *edgePtr;
Stack<Edge *> stack;
Set<std::string>::iterator setIterator;
Set<std::string>::iterator setEnd;
void advanceToNextWordInDawg();
void advanceToNextWordInSet();
void advanceToNextEdge();
public:
iterator() {
this->lp = NULL;
}
iterator(const Lexicon *lp, bool endFlag) {
this->lp = lp;
if (endFlag) {
index = lp->size();
} else {
index = 0;
edgePtr = NULL;
setIterator = lp->otherWords.begin();
setEnd = lp->otherWords.end();
currentDawgPrefix = "";
currentSetWord = "";
advanceToNextWordInDawg();
advanceToNextWordInSet();
}
}
iterator(const iterator & it) {
lp = it.lp;
index = it.index;
currentDawgPrefix = it.currentDawgPrefix;
currentSetWord = it.currentSetWord;
edgePtr = it.edgePtr;
stack = it.stack;
setIterator = it.setIterator;
}
iterator & operator++() {
if (edgePtr == NULL) {
advanceToNextWordInSet();
} else {
if (currentSetWord == "" || currentDawgPrefix < currentSetWord) {
advanceToNextWordInDawg();
} else {
advanceToNextWordInSet();
}
}
index++;
return *this;
}
iterator operator++(int) {
iterator copy(*this);
operator++();
return copy;
}
bool operator==(const iterator & rhs) {
return lp == rhs.lp && index == rhs.index;
}
bool operator!=(const iterator & rhs) {
return !(*this == rhs);
}
std::string operator*() {
if (edgePtr == NULL) return currentSetWord;
if (currentSetWord == "" || currentDawgPrefix < currentSetWord) {
return currentDawgPrefix + lp->ordToChar(edgePtr->letter);
} else {
return currentSetWord;
}
}
std::string *operator->() {
if (edgePtr == NULL) return &currentSetWord;
if (currentSetWord == "" || currentDawgPrefix < currentSetWord) {
tmpWord = currentDawgPrefix + lp->ordToChar(edgePtr->letter);
return &tmpWord;
} else {
return &currentSetWord;
}
}
};
iterator begin() const {
return iterator(this, false);
}
iterator end() const {
return iterator(this, true);
}
private:
Edge *findEdgeForChar(Edge *children, char ch) const;
Edge *traceToLastEdge(const std::string & s) const;
void readBinaryFile(std::string filename);
void deepCopy(const Lexicon & src);
int countDawgWords(Edge *start) const;
unsigned int charToOrd(char ch) const {
return ((unsigned int)(tolower(ch) - 'a' + 1));
}
char ordToChar(unsigned int ord) const {
return ((char)(ord - 1 + 'a'));
}
};
template <typename FunctorType>
void Lexicon::mapAll(FunctorType fn) const {
foreach (std::string word in *this) {
fn(word);
}
}
// hashing functions for lexicons; defined in hashmap.cpp
int hashCode(const Lexicon& l);
#endif