updated specific configuration for parsing

This commit is contained in:
Nicolas Lœuillet
2014-07-13 10:15:40 +02:00
parent 58dbe10388
commit 4e067ceabd
952 changed files with 7585 additions and 5682 deletions

21
inc/3rdparty/site_config/standard/allthingsd.com.txt vendored Normal file → Executable file
View File

@ -1,10 +1,13 @@
title://div[@class="article-title"]/h1[@class="title"]
date: //p[@class="article-date"]
body://*[@class="article-body article-text"]
# Trim out related posts at bottom of article
strip://blockquote[@class="memo"]
# Yup, no idea why author won't work...
author://div[@class="page-header article-header clearfix"]/p[@class="title"]
title://div[@class="article-title"]/h1[@class="title"]
date: //p[@class="article-date"]
body://div[contains(@class, "article-body")]
# Trim out related posts at bottom of article
strip://blockquote[@class="memo"]
tidy: no
# Yup, no idea why author won't work...
author://div[@class="page-header article-header clearfix"]/p[@class="title"]
# [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it.
test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/
test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/
test_url: http://allthingsd.com/20131010/google-cio-ben-fried-on-how-google-works/