7 #ifndef STANDARDTOKENIZER_H
8 #define STANDARDTOKENIZER_H
70 static const int32_t NUM;
71 static const int32_t CJ;
#define LUCENE_CLASS(Name)
Definition: LuceneObject.h:24
Version
Definition: Constants.h:40
A grammar-based tokenizer.
Definition: StandardTokenizer.h:34
static const int32_t ACRONYM_DEP
Definition: StandardTokenizer.h:74
static const Collection< String > TOKEN_TYPES()
String token types that correspond to token type int constants.
virtual void end()
This method is called by the consumer after the last token has been consumed, after incrementToken() returned false (using the new TokenStream API).
StandardTokenizerImplPtr scanner
A private instance of the scanner.
Definition: StandardTokenizer.h:48
StandardTokenizer(LuceneVersion::Version matchVersion, const ReaderPtr &input)
Creates a new instance of the StandardTokenizer. Attaches the input to the newly created scanner.
static const int32_t CJ
Definition: StandardTokenizer.h:71
static const int32_t ACRONYM
Definition: StandardTokenizer.h:66
virtual ~StandardTokenizer()
void setReplaceInvalidAcronym(bool replaceInvalidAcronym)
int32_t maxTokenLength
Definition: StandardTokenizer.h:55
void init(const ReaderPtr &input, LuceneVersion::Version matchVersion)
bool isReplaceInvalidAcronym()
void setMaxTokenLength(int32_t length)
Set the max allowed token length. Any token longer than this is skipped.
bool replaceInvalidAcronym
Definition: StandardTokenizer.h:54
int32_t getMaxTokenLength()
static const int32_t COMPANY
Definition: StandardTokenizer.h:67
static const int32_t HOST
Definition: StandardTokenizer.h:69
StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeFactoryPtr &factory, const ReaderPtr &input)
Creates a new StandardTokenizer with a given AttributeSource.AttributeFactory.
TypeAttributePtr typeAtt
Definition: StandardTokenizer.h:61
StandardTokenizer(LuceneVersion::Version matchVersion, const AttributeSourcePtr &source, const ReaderPtr &input)
Creates a new StandardTokenizer with a given AttributeSource.
TermAttributePtr termAtt
Definition: StandardTokenizer.h:58
static const int32_t APOSTROPHE
Definition: StandardTokenizer.h:65
virtual void reset(const ReaderPtr &input)
Reset the tokenizer to a new reader. Typically, an analyzer (in its reusableTokenStream method) will use this to re-use a previously created tokenizer.
static const int32_t NUM
Definition: StandardTokenizer.h:70
PositionIncrementAttributePtr posIncrAtt
Definition: StandardTokenizer.h:60
OffsetAttributePtr offsetAtt
Definition: StandardTokenizer.h:59
virtual bool incrementToken()
static const int32_t ALPHANUM
Definition: StandardTokenizer.h:64
static const int32_t EMAIL
Definition: StandardTokenizer.h:68
A Tokenizer is a TokenStream whose input is a Reader.
Definition: Tokenizer.h:20
Definition: AbstractAllTermDocs.h:12
boost::shared_ptr< AttributeSource > AttributeSourcePtr
Definition: LuceneTypes.h:520
boost::shared_ptr< PositionIncrementAttribute > PositionIncrementAttributePtr
Definition: LuceneTypes.h:45
boost::shared_ptr< TermAttribute > TermAttributePtr
Definition: LuceneTypes.h:58
boost::shared_ptr< OffsetAttribute > OffsetAttributePtr
Definition: LuceneTypes.h:40
boost::shared_ptr< StandardTokenizerImpl > StandardTokenizerImplPtr
Definition: LuceneTypes.h:53
boost::shared_ptr< Reader > ReaderPtr
Definition: LuceneTypes.h:547
boost::shared_ptr< AttributeFactory > AttributeFactoryPtr
Definition: LuceneTypes.h:519
boost::shared_ptr< TypeAttribute > TypeAttributePtr
Definition: LuceneTypes.h:64