updated specific configuration for parsing

This commit is contained in:
Nicolas Lœuillet
2014-07-13 10:15:40 +02:00
parent 58dbe10388
commit 4e067ceabd
952 changed files with 7585 additions and 5682 deletions

38
inc/3rdparty/site_config/standard/joelonsoftware.com.txt vendored Normal file → Executable file
View File

@@ -1,21 +1,21 @@
# Works with old posts too, such as http://www.joelonsoftware.com/articles/fog0000000332.html
author: substring-after(//div[@class="author"], 'by ')
date: //div[@class="date"]
## Clean stuff at top ##
strip: //h1[1]
strip: //h2[1]
strip: //div[@class="date"]
strip: //div[@class="author"]
## Clean stuff at bottom ##
strip: //blockquote[@class="textmessage"]
strip: //div[@style="width:500px"]/p[last()]
strip: //div[@style="width:500px"]/p[last()-1]
strip: //div[@style="width:500px"]/h4[last()]
strip: //div[@style="width:500px"]/h4[last()-1]
# Works with old posts too, such as http://www.joelonsoftware.com/articles/fog0000000332.html
author: substring-after(//div[@class="author"], 'by ')
date: //div[@class="date"]
## Clean stuff at top ##
strip: //h1[1]
strip: //h2[1]
strip: //div[@class="date"]
strip: //div[@class="author"]
## Clean stuff at bottom ##
strip: //blockquote[@class="textmessage"]
strip: //div[@style="width:500px"]/p[last()]
strip: //div[@style="width:500px"]/p[last()-1]
strip: //div[@style="width:500px"]/h4[last()]
strip: //div[@style="width:500px"]/h4[last()-1]
strip: //div[@style="width:500px"]/div[last()]
test_url: http://www.joelonsoftware.com/items/2011/09/15.html