# HTML5 anyone? The 1980s called, they want their HTML4 back.
# LWN uses so little markup that you really have to be creative.
#
# Site pattern for lwn.net articles. One directive per line; lines that
# start with '#' are comments.

tidy: yes
prune: no

# Follow the "Full Story" link when present. For "Read more" links, append
# 'bigpage' to the link target; that rule is conditional on the page actually
# containing a "Read more" link.
single_page_link: //div[@class='ArticleText']//a[contains(text(), 'Full Story')]/@href
single_page_link: concat(//div[@class='ArticleText']//a[contains(text(), 'Read more')]/@href, 'bigpage')
if_page_contains: //div[@class='ArticleText']//a[contains(text(), 'Read more')]

title: //h1

# After tidying the document, one byline element becomes another.
# NOTE(review): the two element names originally written in this comment
# were lost (markup stripped); restore them from the upstream config.
author: //div[@class='FeatureByline']/strong
date: //div[@class='FeatureByline']/text()[preceding-sibling::br]
strip: //div[@class='FeatureByline']

# Alternative byline layout: author is the text after 'by ' in the second
# paragraph, date is the first paragraph.
author: substring-after(//div[@class='GAByline']/p[2], 'by ')
date: //div[@class='GAByline']/p[1]
strip: //div[@class='GAByline']

# tidy will take care of fixing the tag mess that we make here.
# NOTE(review): the find/replace arguments of the replace_string rules below
# appear to have been stripped (presumably HTML markup lost in transit).
# They are kept exactly as found; restore the original arguments from the
# upstream config before relying on them.
replace_string(

):

replace_string(

):

replace_string(

):

# Make extracting the content before "Log in to post comments" easier.
# And by "easier" I mean possible in all cases without going through
# a lot of XPath pain.
replace_string(
):
replace_string(to post comments)):

# Drop the wrapper presumably introduced by the replace_string rules above
# (class 'ftrss-strip'), take the article text as the body, and remove the
# form table.
strip: //div[@class='ftrss-strip']
body: //div[@class='ArticleText']
strip: //table[@class='Form']

# Login settings: the login form lives at login_uri, and
# not_logged_in_xpath matches the login form when the session is not
# authenticated.
requires_login: yes
login_uri: https://lwn.net/Login/
login_username_field: Username
login_password_field: Password
not_logged_in_xpath: /html/body/div[3]/div[1]/form[@class="loginform"]

test_url: http://lwn.net/Articles/668318/
test_url: http://lwn.net/Articles/668695/
test_url: http://lwn.net/Articles/669114/
test_url: http://lwn.net/Articles/670209/
test_url: http://lwn.net/Articles/670209/rss
test_url: http://lwn.net/Articles/668318/rss
test_url: http://lwn.net/Articles/670062/