class Readability::Document
- Readability::Document
- Reference
- Object
Defined in:
readability/document.cr
Constant Summary
-
ELEMENT_SCORES =
{"div" => 5.0, "blockquote" => 3.0, "form" => -3.0, "th" => -5.0}
-
PARSE_OPTS =
(XML::HTMLParserOptions::NODEFDTD | XML::HTMLParserOptions::NOIMPLIED) | XML::HTMLParserOptions::NOBLANKS
-
REGEXES =
{:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|footer|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|search/i, :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i, :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, :replaceBrsRe => /(?<content>.*)(<br[^>]*>[ \n\r\t]*){2,}/i, :replaceFontsRe => /<(\/?)font[^>]*>/i, :trimRe => /^\s+|\s+$/, :normalizeRe => /\s{2,}/, :killBreaksRe => /(<br\s*\/?>(\s| ?)*){1,}/, :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i}
-
SAVE_OPTS =
(XML::SaveOptions::NO_DECL | XML::SaveOptions::AS_HTML) | XML::SaveOptions::NO_EMPTY
Constructors
Class Method Summary
-
.css_query_to_xpath(query : String) : String
Transform the CSS query into an XPath query. See https://github.com/madeindjs/Crystagiri/blob/master/src/crystagiri/html.cr
- .html_from_input(input, options = XML::HTMLParserOptions::NODEFDTD)
Instance Method Summary
-
#author
Look through the @html document for the author. Precedence information is on the wiki (TODO: attach wiki URL if it is accepted). Returns nil if no author is detected.
- #best_candidate_has_image : Bool
- #best_candidate_has_image=(best_candidate_has_image : Bool)
- #class_weight(e)
- #clean_conditionally(node, candidates, selector)
- #clean_conditionally_reason?(name, counts, content_length, weight, link_density)
- #content(remove_unlikely_candidates = :default)
- #debug(str)
- #get_article(candidates, best_candidate)
- #get_link_density(elem)
- #handle_exclusions!(whitelist, blacklist)
- #html : XML::Node
- #html=(html : XML::Node)
- #meta_image
- #options : Readability::Options
- #options=(options : Readability::Options)
-
#prepare_candidates
This method only touches the @html instance variable.
- #remove_unlikely_candidates!
- #sanitize(node, candidates = {} of Any => Any)
- #score_node(elem)
- #score_paragraphs(min_text_length : Int32) : Hash(XML::Node, Readability::NodeScore)
- #select_best_candidate(candidates)
- #title
- #transform_misused_divs_into_paragraphs!
Constructor Detail
Class Method Detail
def self.css_query_to_xpath(query : String) : String
#
Transform the CSS query into an XPath query. See https://github.com/madeindjs/Crystagiri/blob/master/src/crystagiri/html.cr
Instance Method Detail
def author
#
Look through the @html document for the author. Precedence information is on the wiki (TODO: attach wiki URL if it is accepted). Returns nil if no author is detected.
def score_paragraphs(min_text_length : Int32) : Hash(XML::Node, Readability::NodeScore)
#