class Cadmium::PragmaticTokenizer

Overview

This tokenizer is based on the pragmatic_tokenizer Ruby gem. It is more robust than the other tokenizers, though it offers more features than most use cases require.

Constructor Options

All options are passed as named arguments to .new (see the full signature under Constructor Detail below) and can also be changed afterwards via the setters listed in Instance Method Detail.

Examples

tokenizer = Cadmium::PragmaticTokenizer.new
tokenizer.tokenize("Hello world.")
# => ["hello", "world", "."]

tokenizer.tokenize("Jan. 2015 was 20% colder than now. But not in inter- and outer-space.")
# => ["jan.", "2015", "was", "20%", "colder", "than", "now", ".", "but", "not", "in", "inter", "-", "and", "outer-space", "."]

tokenizer.contractions = {"supa'soo" => "super smooth"}
tokenizer.expand_contractions = true
tokenizer.tokenize("Hello supa'soo guy.")
# => ["hello", "super", "smooth", "guy", "."]

tokenizer.clean = true
tokenizer.tokenize("This sentence has a long string of dots .......................")
# => ["this", "sentence", "has", "a", "long", "string", "of", "dots"]

Defined in:

cadmium/tokenizer/pragmatic/languages.cr
cadmium/tokenizer/pragmatic/languages/bulgarian.cr
cadmium/tokenizer/pragmatic/languages/common.cr
cadmium/tokenizer/pragmatic/languages/czech.cr
cadmium/tokenizer/pragmatic/languages/deutsch.cr
cadmium/tokenizer/pragmatic/languages/english.cr
cadmium/tokenizer/pragmatic/languages/portuguese.cr
cadmium/tokenizer/pragmatic/languages/spanish.cr
cadmium/tokenizer/pragmatic/post_processor.cr
cadmium/tokenizer/pragmatic/pre_processor.cr
cadmium/tokenizer/pragmatic/regex.cr
cadmium/tokenizer/pragmatic_tokenizer.cr

Constant Summary

DOT = "."
MAX_TOKEN_LENGTH = 50
NOTHING = ""
SINGLE_QUOTE = "'"
SPACE = " "

Constructors

Instance Method Summary

Instance methods inherited from class Cadmium::Tokenizer

tokenize(string : String) : Array(String), trim(arr)

Constructor Detail

def self.new(*, language = :en, abbreviations = Set(String).new, stop_words = Set(String).new, contractions = {} of String => String, filter_languages = [] of String | Symbol, hashtags : MentionsOptions = :keep_original, mentions : MentionsOptions = :keep_original, punctuation : PunctuationOptions = :all, numbers : NumbersOptions = :all, expand_contractions : Bool = false, remove_stop_words : Bool = false, remove_emoji : Bool = false, remove_emails : Bool = false, remove_urls : Bool = false, remove_domains : Bool = false, clean : Bool = false, classic_filter : Bool = false, downcase : Bool = true, minimum_length : Int32 = 0, long_word_split : Int32 = 0) #

Creates a new Pragmatic tokenizer.
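
For example, options from the signature above can be combined at construction time. A minimal sketch (the exact output depends on the built-in English contraction list):

tokenizer = Cadmium::PragmaticTokenizer.new(
  language: :en,
  clean: true,
  expand_contractions: true,
  downcase: true
)
tokenizer.tokenize("Hello, I can't wait!")
# => tokens downcased, with "can't" expanded via the built-in contractions and junk runs removed by the cleaner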


[View source]

Instance Method Detail

def abbreviations : Set(String) #

Set of recognized abbreviations


[View source]
def abbreviations=(abbreviations : Set(String)) #

Set of recognized abbreviations
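
In the example output above, "jan." keeps its trailing period, which suggests recognized abbreviations are preserved with the dot attached. A sketch of supplying a custom set (assuming entries are given lowercased and without the trailing period, as in the Ruby gem):

tokenizer.abbreviations = Set{"dr", "vs"}
tokenizer.tokenize("Dr. Smith vs. the world.")
# => expected ["dr.", "smith", "vs.", "the", "world", "."]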


[View source]
def classic_filter : Bool #

Run the classic filter?


[View source]
def classic_filter=(classic_filter : Bool) #

Run the classic filter?


[View source]
def clean : Bool #

Run the cleaner after we've tokenized?


[View source]
def clean=(clean : Bool) #

Run the cleaner after we've tokenized?


[View source]
def contractions : Hash(String, String) #

Contractions to be replaced


[View source]
def contractions=(contractions : Hash(String, String)) #

Contractions to be replaced


[View source]
def downcase : Bool #

Downcase all tokens?


[View source]
def downcase=(downcase : Bool) #

Downcase all tokens?


[View source]
def expand_contractions : Bool #

Do we want to expand contractions ("he's" => "he is")?


[View source]
def expand_contractions=(expand_contractions : Bool) #

Do we want to expand contractions ("he's" => "he is")?


[View source]
def filter_languages : Array(String | Symbol) #

Other languages to include in the filtering of abbreviations, contractions, and stop words


[View source]
def filter_languages=(filter_languages : Array(String | Symbol)) #

Other languages to include in the filtering of abbreviations, contractions, and stop words


[View source]
def hashtags : MentionsOptions #

What to do with hashtags (#awesome)


[View source]
def hashtags=(hashtags : MentionsOptions) #

What to do with hashtags (#awesome)
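
The default is :keep_original. Assuming the enum also defines a :remove member that drops hashtag tokens entirely (mirroring the Ruby gem's options), a sketch:

tokenizer.hashtags = :remove
tokenizer.tokenize("Such a great day #sunny")
# => expected ["such", "a", "great", "day"]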


[View source]
def long_word_split : Int32 #

Maximum length of a word before it is split at any hyphen or underscore


[View source]
def long_word_split=(long_word_split : Int32) #

Maximum length of a word before it is split at any hyphen or underscore
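
As a sketch, assuming a value of n splits any token longer than n characters at its hyphens and underscores (as the Ruby gem does):

tokenizer.long_word_split = 10
tokenizer.tokenize("A once-in-a-lifetime experience.")
# => expected ["a", "once", "in", "a", "lifetime", "experience", "."]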


[View source]
def mentions : MentionsOptions #

What to do with mentions (@watzon)


[View source]
def mentions=(mentions : MentionsOptions) #

What to do with mentions (@watzon)


[View source]
def minimum_length : Int32 #

Minimum length for tokens


[View source]
def minimum_length=(minimum_length : Int32) #

Minimum length for tokens
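
Assuming tokens shorter than the minimum are dropped after tokenization, a sketch:

tokenizer.minimum_length = 3
tokenizer.tokenize("I am so happy to be here")
# => expected ["happy", "here"]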


[View source]
def numbers : NumbersOptions #

What to do with numbers


[View source]
def numbers=(numbers : NumbersOptions) #

What to do with numbers


[View source]
def punctuation : PunctuationOptions #

What to do with punctuation


[View source]
def punctuation=(punctuation : PunctuationOptions) #

What to do with punctuation


[View source]
def remove_domains : Bool #

Should we remove domains?


[View source]
def remove_domains=(remove_domains : Bool) #

Should we remove domains?


[View source]
def remove_emails : Bool #

Should we remove emails?


[View source]
def remove_emails=(remove_emails : Bool) #

Should we remove emails?


[View source]
def remove_emoji : Bool #

Should we remove emojis?


[View source]
def remove_emoji=(remove_emoji : Bool) #

Should we remove emojis?


[View source]
def remove_stop_words : Bool #

Should we remove stop words?


[View source]
def remove_stop_words=(remove_stop_words : Bool) #

Should we remove stop words?
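
A sketch, assuming the set assigned to #stop_words is used as the stop word list when this flag is enabled:

tokenizer.stop_words = Set{"the", "of", "a"}
tokenizer.remove_stop_words = true
tokenizer.tokenize("The center of a city.")
# => expected ["center", "city", "."]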


[View source]
def remove_urls : Bool #

Should we remove URLs?


[View source]
def remove_urls=(remove_urls : Bool) #

Should we remove URLs?


[View source]
def stop_words : Set(String) #

The set of stop words


[View source]
def stop_words=(stop_words : Set(String)) #

The set of stop words


[View source]
def tokenize(string : String) : Array(String) #

[View source]
def tokens : Array(String) #

Array of output tokens


[View source]