class Cadmium::PragmaticTokenizer::Languages::Common

Direct Known Subclasses

Defined in:

cadmium/tokenizer/pragmatic/languages/common.cr

Constant Summary

ABBREVIATIONS = Set(String).new
ALNUM_QUOTE = /(\w|\D)'(?!')(?=\W|$)/

Single quotes handling

CONTRACTIONS = {} of String => String
PUNCTUATION_MAP = {"。" => "♳", "．" => "♴", "." => "♵", "！" => "♶", "!" => "♷", "?" => "♸", "？" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘", "‘" => "⚭"}
QUOTE_NOT_TWAS1 = /(\W|^)'(?!twas)/i
QUOTE_NOT_TWAS2 = /(\W|^)‘(?!twas)/i
QUOTE_WORD = /(\W|^)'(?=\w)/
STOP_WORDS = Set(String).new

Class Method Summary

Class Method Detail

def self.abbreviations #

[View source]
def self.contractions #

[View source]
def self.handle_single_quotes(text) #

This 'special treatment' of single quotes is actually relevant for many other tests. TODO: alter the core regular expressions!


[View source]
def self.punctuation_map #

[View source]
def self.stop_words #

[View source]