# Site extraction rules: XPath selectors for article metadata and body,
# followed by the elements to strip from the extracted content.
# (The example URL further down points at theverge.com.)

# Metadata is read from <meta> tags.
author: //meta[@name="author"]/@content
title: //meta[@property="og:title"]/@content
date: //meta[@property="article:published_time"]/@content

# Article body candidates.
# The //picture selector seems to cause problems with text extraction, so the
# variant that also matched the hero <picture> is kept below, commented out.
# body: //picture[contains(@class, 'c-picture')] | //div[contains(@class, 'c-entry-content') or contains(@class, 'c-entry-hero__image')]
body: //div[contains(@class, 'c-entry-content') or contains(@class, 'c-entry-hero__image')]
# For vergecasts, e.g. http://www.theverge.com/2013/8/22/4648566/the-vergecast-090-august-22th-2013-video
body: //article
# Matches a <div> whose class list contains the exact token "l-col__main"
# (the concat/normalize-space idiom avoids matching longer class names).
body: //div[contains(concat(' ',normalize-space(@class),' '),' l-col__main ')]

# Elements removed by XPath.
strip: //aside
strip: //nav

# Elements removed by id/class name.
# NOTE(review): presumably matched as a substring of the id or class attribute - confirm against the engine docs.
strip_id_or_class: gallery
strip_id_or_class: article-meta
strip_id_or_class: story-navigation
strip_id_or_class: slegend
strip_id_or_class: related-product-meta
strip_id_or_class: comments
strip_id_or_class: ui-jump-list
strip_id_or_class: pullquote
strip_id_or_class: m-ad
strip_id_or_class: social-sharing
strip_id_or_class: m-video-entry__excerpt
strip_id_or_class: hidden
strip_id_or_class: m-article__follow-bar
strip_id_or_class: m-article__share-buttons
strip_id_or_class: l-col__sidebar
strip_id_or_class: c-river
strip_id_or_class: chorus-ad-placement
strip_id_or_class: c-related-list
strip_id_or_class: sr-only

# 2017
strip_id_or_class: e-image__meta
# NOTE(review): the search and replacement arguments below look empty; they may
# have been lost (e.g. markup stripped from the file) - verify against the upstream config.
replace_string( ): 

# 2023
# Categories
# NOTE(review): the first selector is positional (first <div> under #content) and
# will silently stop matching if the page layout changes.
strip: //*[@id="content"]/div[1]/ul
strip: //li[contains(concat(' ',normalize-space(@class),' '),' inline ')]

# Author blurb (date/comments line, disclaimer, byline)
strip_id_or_class: duet--article--date-and-comments
strip_id_or_class: duet--article--disclaimer
strip_id_or_class: duet--article--article-byline

# Header divider
# Matches any <div> whose class list contains the exact token "grow".
strip: //div[contains(concat(' ',normalize-space(@class),' '),' grow ')]

# Fix double images: drop the <img> that is a direct child of the
# content-warning and "Zoom" wrappers.
strip: //div[contains(@class, 'duet--media--content-warning')]/img
strip: //div[@aria-label="Zoom"]/img

# NOTE(review): the directive below is cut off at the end of the visible text;
# it is left exactly as found.
replace_string(