class Readability::Document

Defined in:

readability/document.cr

Constant Summary

ELEMENT_SCORES = {"div" => 5.0, "blockquote" => 3.0, "form" => -3.0, "th" => -5.0}
PARSE_OPTS = (XML::HTMLParserOptions::NODEFDTD | XML::HTMLParserOptions::NOIMPLIED) | XML::HTMLParserOptions::NOBLANKS
REGEXES = {:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|footer|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|search/i, :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, :replaceBrsRe => /(?<content>.*)(<br[^>]*>[ \n\r\t]*){2,}/i, :replaceFontsRe => /<(\/?)font[^>]*>/i, :trimRe => /^\s+|\s+$/, :normalizeRe => /\s{2,}/, :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/, :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i}
SAVE_OPTS = (XML::SaveOptions::NO_DECL | XML::SaveOptions::AS_HTML) | XML::SaveOptions::NO_EMPTY

Constructors

Class Method Summary

Instance Method Summary

Constructor Detail

def self.new(input : String, options : Readability::Options = Options.new) #

[View source]

Class Method Detail

def self.css_query_to_xpath(query : String) : String #

Transforms the CSS query into an XPath query. See https://github.com/madeindjs/Crystagiri/blob/master/src/crystagiri/html.cr


[View source]
def self.html_from_input(input, options = XML::HTMLParserOptions::NODEFDTD) #

[View source]

Instance Method Detail

def author #

Looks through the @html document for the author. Precedence information is on the wiki (TODO: attach wiki URL if it is accepted). Returns nil if no author is detected.


[View source]
def best_candidate_has_image : Bool #

[View source]
def best_candidate_has_image=(best_candidate_has_image : Bool) #

[View source]
def class_weight(e) #

[View source]
def clean_conditionally(node, candidates, selector) #

[View source]
def clean_conditionally_reason?(name, counts, content_length, weight, link_density) #

[View source]
def content(remove_unlikely_candidates = :default) #

[View source]
def debug(str) #

[View source]
def get_article(candidates, best_candidate) #

[View source]
def get_link_density(elem) #

[View source]
def handle_exclusions!(whitelist, blacklist) #

[View source]
def html : XML::Node #

[View source]
def html=(html : XML::Node) #

[View source]
def meta_image #

[View source]
def options : Readability::Options #

[View source]
def options=(options : Readability::Options) #

[View source]
def prepare_candidates #

This method only touches the @html instance variable.


[View source]
def remove_unlikely_candidates! #

[View source]
def sanitize(node, candidates = {} of Any => Any) #

[View source]
def score_node(elem) #

[View source]
def score_paragraphs(min_text_length : Int32) : Hash(XML::Node, Readability::NodeScore) #

[View source]
def select_best_candidate(candidates) #

[View source]
def title #

[View source]
def transform_misused_divs_into_paragraphs! #

[View source]