Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse byline fix #132

Merged
merged 8 commits into from
Jan 10, 2016
Merged
36 changes: 21 additions & 15 deletions newspaper/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,18 @@ def get_authors(self, doc):
def contains_digits(d):
    """Return True when `_digits.search(d)` finds a match, else False.

    NOTE(review): `_digits` is defined outside this excerpt; presumably a
    compiled regex that matches numeric characters -- confirm at its
    definition.
    """
    return bool(_digits.search(d))

def uniqify_list(l):
    """Remove duplicates from provided list but maintain original order.

    Derived from http://www.peterbe.com/plog/uniqifiers-benchmark

    Items are compared case-insensitively (by their lowercased form), so
    only the first occurrence of each distinct string is kept. Every kept
    item is returned title-cased via ``str.title()``.

    :param l: iterable of strings
    :return: new list of title-cased strings in first-seen order
    """
    seen = set()
    result = []
    for item in l:
        # Lowercase once and use it as the de-duplication key.
        key = item.lower()
        if key in seen:
            continue
        seen.add(key)
        result.append(item.title())
    return result

def parse_byline(search_str):
"""Takes a candidate line of html or text and
extracts out the name(s) in list form
Expand All @@ -95,21 +107,19 @@ def parse_byline(search_str):
search_str = search_str.strip()

# Chunk the line by non alphanumeric tokens (few name exceptions)
# >>> re.split("[^\w\'\-]", "Lucas Ou, Dean O'Brian and Ronald")
# ['Lucas Ou', '', 'Dean O'Brian', 'and', 'Ronald']
name_tokens = re.split("[^\w\'\-]", search_str)
# >>> re.split("[^\w\'\-\.]", "Tyler G. Jones, Lucas Ou, Dean O'Brian and Ronald")
# ['Tyler', 'G.', 'Jones', '', 'Lucas', 'Ou', '', 'Dean', "O'Brian", 'and', 'Ronald']
name_tokens = re.split("[^\w\'\-\.]", search_str)
name_tokens = [s.strip() for s in name_tokens]

_authors = []
# List of first, last name tokens
curname = []
DELIM = ['and', '']
DELIM = ['and', ',', '']

for token in name_tokens:
if token in DELIM:
# should we allow middle names?
valid_name = (len(curname) == 2)
if valid_name:
if len(curname) > 0:
_authors.append(' '.join(curname))
curname = []

Expand All @@ -126,9 +136,9 @@ def parse_byline(search_str):
# Try 1: Search popular author tags for authors

ATTRS = ['name', 'rel', 'itemprop', 'class', 'id']
VALS = ['author', 'byline']
VALS = ['author', 'byline', 'dc.creator']
matches = []
_authors, authors = [], []
authors = []

for attr in ATTRS:
for val in VALS:
Expand All @@ -145,13 +155,9 @@ def parse_byline(search_str):
else:
content = match.text or ''
if len(content) > 0:
_authors.extend(parse_byline(content))
authors.extend(parse_byline(content))

uniq = list(set([s.lower() for s in _authors]))
for name in uniq:
names = [w.capitalize() for w in name.split(' ')]
authors.append(' '.join(names))
return authors or []
return uniqify_list(authors)

# TODO Method 2: Search raw html for a by-line
# match = re.search('By[\: ].*\\n|From[\: ].*\\n', html)
Expand Down
12 changes: 6 additions & 6 deletions tests/data/html/cnn_article.html
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
<meta content="A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel." itemprop="description" name="description" property="og:description"/>
<meta content="winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm" itemprop="keywords" name="keywords"/>
<meta content="winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm" name="news_keywords"/>
<meta content="Dana Ford and Tom Watkins, CNN" itemprop="author" name="author"/>
<meta content="Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN" itemprop="author" name="author"/>
<meta content="travel" itemprop="articleSection" name="section"/>
<meta content="CNN" itemprop="sourceOrganization" name="source"/>
<meta content="" name="subsection"/>
Expand Down Expand Up @@ -68,7 +68,7 @@
cnnBrandingValue="default";
cnnPartnerValue="";
cnnOmniBranding="",
cnnAuthor="Dana Ford and Tom Watkins, CNN",
cnnAuthor="Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN",
disqus_category_id=207582,
disqus_identifier="/2013/11/27/travel/weather-thanksgiving/index.html",
disqus_title="After storm, forecasters see smooth sailing for Thanksgiving",
Expand All @@ -87,7 +87,7 @@
business: {
cnn: {
page: {
author: "Dana Ford and Tom Watkins, CNN",
author: "Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN",
broadcast_franchise: "",
video_embed_count: "4",
publish_date: "2013/11/27",
Expand Down Expand Up @@ -294,7 +294,7 @@ <h1>After storm, forecasters see smooth sailing for Thanksgiving</h1>
<!--endclickprintinclude--><!--startclickprintexclude-->
<!--no partner-->
<div class="cnn_stryathrtmp">
<div class="cnnByline">By <strong>Dana Ford </strong>and<strong> Tom Watkins, </strong>CNN</div>
<div class="cnnByline">By <strong>Dana A. Ford </strong>, <strong>James S.A. Corey</strong>, <strong>Chien-Ming Wang</strong>, and<strong> Tom Watkins, </strong>CNN</div>
<div class="cnn_strytmstmp">November 28, 2013 -- Updated 0203 GMT (1003 HKT)</div>
</div>
<!--google_ad_section_end-->
Expand Down Expand Up @@ -1022,7 +1022,7 @@ <h2>
cnnsocial.share.setconfig(cnn_shareconfig);
$j(document).ready(function () {
'use strict';
loadChartbeat("travel", "Dana Ford and Tom Watkins, CNN");
loadChartbeat("travel", "Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN");
CNN.initFlipperTicker();
/* initialize cnnsocial */
cnnsocial.init();
Expand All @@ -1041,4 +1041,4 @@ <h2>
</script>
<script src="http://cache-02.cleanprint.net/cpf/cleanprint?key=cnn" name="cleanprintloader"></script>
</body>
</html>
</html>
6 changes: 3 additions & 3 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def test_url(self):
def test_download_html(self):
html = mock_resource_with('cnn_article', 'html')
self.article.download(html)
assert len(self.article.html) == 75175
assert len(self.article.html) == 75406

@print_test
def test_pre_download_parse(self):
Expand All @@ -169,7 +169,7 @@ def test_pre_download_parse(self):

@print_test
def test_parse_html(self):
AUTHORS = ['Dana Ford', 'Tom Watkins']
AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey', 'Tom Watkins']
TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
LEN_IMGS = 46
META_LANG = 'en'
Expand Down Expand Up @@ -209,7 +209,7 @@ def test_meta_extraction(self):
'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
'og': {'site_name': 'CNN','description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.', 'title': 'After storm, forecasters see smooth sailing for Thanksgiving', 'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html', 'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg', 'type': 'article'},
'section': 'travel',
'author': 'Dana Ford and Tom Watkins, CNN',
'author': 'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
'robots': 'index,follow',
'vr': {'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'},
'source': 'CNN',
Expand Down