# HTML5 anyone? The 1980s called, they want their HTML4 back.
# LWN uses so little markup that you really have to be creative.
# Parser switches: tidy preprocessing is turned on and pruning is turned off.
# (What each switch does is defined by the consuming site-config parser; this
# file only sets the values.)
tidy: yes
prune: no
# Single-page view, case 1: use the href of the 'Full Story' link found
# inside the ArticleText div.
single_page_link: //div[@class='ArticleText']//a[contains(text(), 'Full Story')]/@href
# Single-page view, case 2: take the href of the 'Read more' link inside the
# ArticleText div and append 'bigpage' to it via concat().
single_page_link: concat(//div[@class='ArticleText']//a[contains(text(), 'Read more')]/@href, 'bigpage')
# Matches only when a 'Read more' link exists inside the ArticleText div.
# NOTE(review): presumably this condition guards the concat() rule directly
# above, so that a bare 'bigpage' is never used as a URL -- confirm against
# the parser's handling of if_page_contains.
if_page_contains: //div[@class='ArticleText']//a[contains(text(), 'Read more')]
# The article headline is taken from the h1 element.
title: //h1
# After tidying the document, the byline markup differs from the raw page.
# NOTE(review): the before/after tag names are missing from this comment as
# seen here; the author rule below selects <strong>, so presumably tidy has
# rewritten the page's original bold tag into <strong> -- confirm.
# Byline variant 1 (div class 'FeatureByline'): the author is the <strong>
# child, the date is the text node that has a preceding <br> sibling, and the
# byline div itself is then stripped.
author: //div[@class='FeatureByline']/strong
date: //div[@class='FeatureByline']/text()[preceding-sibling::br]
strip: //div[@class='FeatureByline']
# Byline variant 2 (div class 'GAByline'): the author is whatever follows
# 'by ' in the second paragraph, the date is the first paragraph, and the
# byline div itself is then stripped.
author: substring-after(//div[@class='GAByline']/p[2], 'by ')
date: //div[@class='GAByline']/p[1]
strip: //div[@class='GAByline']
# tidy will take care of fixing the tag mess that we make here.
# NOTE(review): as seen here, every replace_string directive in this section
# has empty search and replacement text -- the markup they operated on appears
# to be missing. Restore it from the upstream config before relying on them.
replace_string():
replace_string():
replace_string(
):
# Make extracting the content before "Log in to post comments" easier.
# And by "easier" I mean possible in all cases without going through
# a lot of XPath pain.
# NOTE(review): a strip rule further down targets div class 'ftrss-strip', so
# this replacement presumably wraps the unwanted trailing section in such a
# div -- its arguments are empty as seen here; confirm.
replace_string(
):
# Remove any div with class 'ftrss-strip'.
strip: //div[@class='ftrss-strip']
# The article content is the ArticleText div.
body: //div[@class='ArticleText']
# Remove any table with class 'Form'.
strip: //table[@class='Form']
# Login settings: a login is required; the login form is at the URL below and
# the credentials go into the fields named 'Username' and 'Password'.
requires_login: yes
login_uri: https://lwn.net/Login/
login_username_field: Username
login_password_field: Password
# XPath for the login form (class 'loginform'); going by the directive name,
# a match signals that the session is not logged in.
# NOTE(review): the absolute /html/body/div[3]/div[1] path depends on the
# exact page layout -- re-check it whenever the site template changes.
not_logged_in_xpath: /html/body/div[3]/div[1]/form[@class="loginform"]
# Sample article URLs for exercising this config. Two of them (668318 and
# 670209) are listed both in plain form and with an '/rss' suffix.
# NOTE(review): these use http:// while login_uri uses https:// -- confirm
# whether the test URLs should be switched to https as well.
test_url: http://lwn.net/Articles/668318/
test_url: http://lwn.net/Articles/668695/
test_url: http://lwn.net/Articles/669114/
test_url: http://lwn.net/Articles/670209/
test_url: http://lwn.net/Articles/670209/rss
test_url: http://lwn.net/Articles/668318/rss
test_url: http://lwn.net/Articles/670062/