codelucas · codelucas · Jan 10, 2016 · Mar 27, 2015 · Mar 27, 2015 · Mar 27, 2015
diff --git a/newspaper/extractors.py b/newspaper/extractors.py
@@ -79,6 +79,18 @@ def get_authors(self, doc):
  def contains_digits(d):
  return bool(_digits.search(d))
 
+ def uniqify_list(l):
+ """Remove case-insensitive duplicates from a list of strings, keeping first-seen order.
+ Kept items are returned title-cased. Derived from http://www.peterbe.com/plog/uniqifiers-benchmark
+ """
+ seen = {}  # lowercased items already emitted; the dict is used only for membership tests
+ result = []
+ for item in l:
+ if item.lower() in seen: continue  # a case-insensitive repeat of an earlier item
+ seen[item.lower()] = 1
+ result.append(item.title())  # NOTE: str.title() lowercases interior capitals ("McDonald" -> "Mcdonald")
+ return result
+
  def parse_byline(search_str):
  """Takes a candidate line of html or text and
  extracts out the name(s) in list form
@@ -95,21 +107,19 @@ def parse_byline(search_str):
  search_str = search_str.strip()
 
  # Chunk the line by non alphanumeric tokens (few name exceptions)
- # >>> re.split("[^\w\'\-]", "Lucas Ou, Dean O'Brian and Ronald")
- # ['Lucas Ou', '', 'Dean O'Brian', 'and', 'Ronald']
- name_tokens = re.split("[^\w\'\-]", search_str)
+ # >>> re.split("[^\w\'\-\.]", "Tyler G. Jones, Lucas Ou, Dean O'Brian and Ronald")
+ # ['Tyler', 'G.', 'Jones', '', 'Lucas', 'Ou', '', 'Dean', "O'Brian", 'and', 'Ronald']
+ name_tokens = re.split("[^\w\'\-\.]", search_str)
  name_tokens = [s.strip() for s in name_tokens]
 
  _authors = []
  # List of first, last name tokens
  curname = []
- DELIM = ['and', '']
+ DELIM = ['and', ',', '']
 
  for token in name_tokens:
  if token in DELIM:
- # should we allow middle names?
- valid_name = (len(curname) == 2)
- if valid_name:
+ if len(curname) > 0:
  _authors.append(' '.join(curname))
  curname = []
 
@@ -126,9 +136,9 @@ def parse_byline(search_str):
  # Try 1: Search popular author tags for authors
 
  ATTRS = ['name', 'rel', 'itemprop', 'class', 'id']
- VALS = ['author', 'byline']
+ VALS = ['author', 'byline', 'dc.creator']
  matches = []
- _authors, authors = [], []
+ authors = []
 
  for attr in ATTRS:
  for val in VALS:
@@ -145,13 +155,9 @@ def parse_byline(search_str):
  else:
  content = match.text or ''
  if len(content) > 0:
- _authors.extend(parse_byline(content))
+ authors.extend(parse_byline(content))
 
- uniq = list(set([s.lower() for s in _authors]))
- for name in uniq:
- names = [w.capitalize() for w in name.split(' ')]
- authors.append(' '.join(names))
- return authors or []
+ return uniqify_list(authors)
 
  # TODO Method 2: Search raw html for a by-line
  # match = re.search('By[\: ].*\\n|From[\: ].*\\n', html)

diff --git a/tests/data/html/cnn_article.html b/tests/data/html/cnn_article.html
@@ -13,7 +13,7 @@
 <meta content="A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel." itemprop="description" name="description" property="og:description"/>
 <meta content="winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm" itemprop="keywords" name="keywords"/>
 <meta content="winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm" name="news_keywords"/>
-<meta content="Dana Ford and Tom Watkins, CNN" itemprop="author" name="author"/>
+<meta content="Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN" itemprop="author" name="author"/>
 <meta content="travel" itemprop="articleSection" name="section"/>
 <meta content="CNN" itemprop="sourceOrganization" name="source"/>
 <meta content="" name="subsection"/>
@@ -68,7 +68,7 @@
 cnnBrandingValue="default";
 cnnPartnerValue="";
 cnnOmniBranding="",
-cnnAuthor="Dana Ford and Tom Watkins, CNN",
+cnnAuthor="Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN",
 disqus_category_id=207582,
 disqus_identifier="/2013/11/27/travel/weather-thanksgiving/index.html",
 disqus_title="After storm, forecasters see smooth sailing for Thanksgiving",
@@ -87,7 +87,7 @@
 business: {
 cnn: {
 page: {
-author: "Dana Ford and Tom Watkins, CNN",
+author: "Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN",
 broadcast_franchise: "",
 video_embed_count: "4",
 publish_date: "2013/11/27",
@@ -294,7 +294,7 @@ <h1>After storm, forecasters see smooth sailing for Thanksgiving</h1>
 <!--endclickprintinclude--><!--startclickprintexclude-->
 <!--no partner-->
 <div class="cnn_stryathrtmp">
-<div class="cnnByline">By <strong>Dana Ford </strong>and<strong> Tom Watkins, </strong>CNN</div>
+<div class="cnnByline">By <strong>Dana A. Ford </strong>, <strong>James S.A. Corey</strong>, <strong>Chien-Ming Wang</strong>, and<strong> Tom Watkins, </strong>CNN</div>
 <div class="cnn_strytmstmp">November 28, 2013 -- Updated 0203 GMT (1003 HKT)</div>
 </div>
 <!--google_ad_section_end-->
@@ -1022,7 +1022,7 @@ <h2>
 cnnsocial.share.setconfig(cnn_shareconfig);
 $j(document).ready(function () {
 'use strict';
-loadChartbeat("travel", "Dana Ford and Tom Watkins, CNN");
+loadChartbeat("travel", "Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN");
 CNN.initFlipperTicker();
 /* initialize cnnsocial */
 cnnsocial.init();
@@ -1041,4 +1041,4 @@ <h2>
 </script>
 <script src="http://cache-02.cleanprint.net/cpf/cleanprint?key=cnn" name="cleanprintloader"></script>
 </body>
-</html>
+</html>
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -158,7 +158,7 @@ def test_url(self):
  def test_download_html(self):
  html = mock_resource_with('cnn_article', 'html')
  self.article.download(html)
- assert len(self.article.html) == 75175
+ assert len(self.article.html) == 75406
 
  @print_test
  def test_pre_download_parse(self):
@@ -169,7 +169,7 @@ def test_pre_download_parse(self):
 
  @print_test
  def test_parse_html(self):
- AUTHORS = ['Dana Ford', 'Tom Watkins']
+ AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey', 'Tom Watkins']
  TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
  LEN_IMGS = 46
  META_LANG = 'en'
@@ -209,7 +209,7 @@ def test_meta_extraction(self):
  'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
  'og': {'site_name': 'CNN','description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.', 'title': 'After storm, forecasters see smooth sailing for Thanksgiving', 'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html', 'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg', 'type': 'article'},
  'section': 'travel',
- 'author': 'Dana Ford and Tom Watkins, CNN',
+ 'author': 'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
  'robots': 'index,follow',
  'vr': {'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'},
  'source': 'CNN',