# Full-text extraction rules: one "directive: value" pair per line.
# Lines starting with "#" are comments.

title: //h1[@class='article__title']

# An article can have multiple authors
author: //span[contains(concat(' ',normalize-space(@class),' '),' author__name ')]
author: //div[@class="entry-meta"]//a[contains(@class, "url")]

# Date of the last edit (if any)
date: //time[@itemprop='dateModified']/@datetime
# Publication date
date: //time[@itemprop='datePublished']/@datetime

body: //section[contains(concat(' ',normalize-space(@class), ' '), ' article__content ')]

# Alternative body selector, plus a strip rule, for video-only pages
body: //section[contains(concat(' ',normalize-space(@class), ' '), ' video ')]
strip: //div[contains(concat(' ',normalize-space(@class), ' '), ' related-content--video ')]

strip: //section[contains(@class, 'services')]
strip: //section[@id="js-capping"]
strip: //footer[contains(@class, "article__footer-single")]

# Remove "Lire aussi" ("Read also") blocks
strip: //section[contains(concat(' ',normalize-space(@class),' '),' catcher ')]

# Remove "Lire aussi" paragraphs (a <p> whose <strong> text contains "Lire" and that also has a link)
strip: //p[contains(strong, 'Lire') and a]

# Remove comments
strip: //*[contains(@class, 'comments')]

# Remove the "Article réservé aux abonnés" ("Subscribers-only article") notice
strip: //p[@class='article__status']

# Remove the "Lecture restreinte Votre abonnement n’autorise pas la lecture de cet article"
# ("Restricted reading: your subscription does not allow reading this article") notice
strip: //section[@id='js-capping-old-article']

# Remove quotes highlighted in articles: they duplicate text already in the content.
# We select parent::blockquote so that no empty blockquote node is left behind.
strip: //p[@class='article__quote']/parent::blockquote

# Remove share buttons
strip: //ul[contains(@class, 'meta__social')]
strip: //a[contains(@class, 'Header__offer')]

# Remove gift modal
strip_id_or_class: article__gift-modal
strip_id_or_class: reuse__content

# Remove the insane "conjugaison.lemonde.fr" links:
find_string: