Added anonbib for use in papers list
Source: https://gitweb.torproject.org/anonbib.git Commit: b478fc493d4be2115185d94e077bf06196495417
i2p2www/anonbib/rank.py (new file, 202 lines)
@@ -0,0 +1,202 @@
# Make rankings of papers and authors for automatic classification of content hotness

# Google Scholar address
# http://scholar.google.com/scholar?as_epq=

# Take care of the caching setup
cache_expire = 60*60*24*30 # 30 days

# Checks
import config
import os
import sys
from os.path import exists, isdir, join, getmtime
from os import listdir, remove
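
# How the cache works, for reference: each Scholar results page is written
# to a file in cache_folder() (defined below), named with the md5 hex digest
# of the query URL; remove_old() evicts entries whose mtime is older than
# cache_expire (30 days).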

def remove_old():
    # Remove all old cached files
    filenames = listdir(cache_folder())
    from time import time
    now = time()
    for f in filenames:
        pf = join(cache_folder(), f)
        time_mt = getmtime(pf)
        if now - time_mt > cache_expire: # 30 days
            remove(pf)

def cache_folder():
    r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
    if not exists(r):
        os.makedirs(r)
    assert isdir(r)
    return r

import re
from urllib2 import urlopen, build_opener
from urllib import quote
from datetime import date
import hashlib

# A more handy hash
def md5h(s):
    m = hashlib.md5()
    m.update(s)
    return m.hexdigest()

format_tested = 0
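
# For reference: md5h() returns a 32-character hex string, safe to use as a
# cache filename; e.g. md5h("") == "d41d8cd98f00b204e9800998ecf8427e".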

def getPageForTitle(title, cache=True, update=True, save=True):
    # Returns a (url, page-contents) tuple, or (url, None) if the page
    # could not be fetched
    global format_tested
    if not format_tested and update:
        format_tested = 1
        TestScholarFormat()

    # Do not assume that the title is clean
    title = re.sub("\s+", " ", title)
    title = re.sub("[^'a-zA-Z0-9\. \-\/:]", "", title)
    title = re.sub("'\/", " ", title)

    # We rely on google scholar to return the article with this exact title
    gurl = "http://scholar.google.com/scholar?as_q=&as_epq=%s&as_occt=title"

    url = gurl % quote(title)

    # Access cache or network
    if exists(join(cache_folder(), md5h(url))) and cache:
        return url, file(join(cache_folder(), md5h(url)), 'r').read()
    elif update:
        print "Downloading rank for %r."%title

        # Make a custom user agent (so that we are not filtered by Google)!
        opener = build_opener()
        opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]

        print "connecting..."
        connection = opener.open(url)
        print "reading"
        page = connection.read()
        print "done"
        if save:
            file(join(cache_folder(), md5h(url)), 'w').write(page)
        return url, page
    else:
        return url, None
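
# Usage sketch (title chosen for illustration; hits the network unless a
# cached copy exists and cache=True):
#   url, page = getPageForTitle("Tor: The Second-Generation Onion Router")
#   # page is None only when update=False and there is no cache entry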

def getCite(title, cache=True, update=True, save=True):
    # Returns a (citation-count, scholar url) tuple, or (None, None)
    url, page = getPageForTitle(title, cache=cache, update=update, save=save)
    if not page:
        return None, None

    # Check if it finds any articles
    if len(re.findall("did not match any articles", page)) > 0:
        return (None, None)

    # Kill all tags!
    cpage = re.sub("<[^>]*>", "", page)

    # Add up all citations
    s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
    return (s, url)
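
# Note that the counts are summed over every hit on the results page: if the
# page lists "Cited by 120" and "Cited by 3" for two versions of the same
# paper, getCite() returns (123, url).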

def getPaperURLs(title, cache=True, update=True, save=True):
    url, page = getPageForTitle(title, cache=cache, update=update, save=save)
    if not page:
        return []
    pages = re.findall(r'\&\#x25ba\;.*class=fl href="([^"]*)"', page)
    return pages
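
# The regex above keys on the "&#x25ba;" arrow entity that Scholar's markup
# placed before alternate-version links (an assumption about the page format
# of the time) and captures the href of the following class=fl link; if the
# markup changes, findall() silently returns [].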

def get_rank_html(title, years=None, base_url=".", update=True,
                  velocity=False):
    s, url = getCite(title, update=update)

    # Paper cannot be found
    if s is None:
        return ''

    html = ''

    # Escape ampersands so the URL can be embedded in an HTML attribute
    url = url.replace("&", "&amp;")

    # Hotness
    H, h = 50, 5
    if s >= H:
        html += '<a href="%s"><img src="%s/gold.gif" alt="More than %s citations on Google Scholar" title="More than %s citations on Google Scholar" /></a>' % (url, base_url, H, H)
    elif s >= h:
        html += '<a href="%s"><img src="%s/silver.gif" alt="More than %s citations on Google Scholar" title="More than %s citations on Google Scholar" /></a>' % (url, base_url, h, h)

    # Only include the velocity if asked.
    if velocity:
        # Velocity
        d = date.today().year - int(years)
        if d >= 0:
            if 2 < s / (d + 1) < 10:
                html += '<img src="%s/ups.gif" />' % base_url
            if 10 <= s / (d + 1):
                html += '<img src="%s/upb.gif" />' % base_url

    return html
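
# Worked example (hypothetical numbers): a paper from 2004 with s = 60
# citations, ranked in 2014, gets the gold icon (60 >= H); with
# velocity=True it also gets ups.gif, since d = 10 and s / (d + 1) = 60/11
# = 5 under integer division, which falls in (2, 10).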

def TestScholarFormat():
    # We need to ensure that Google Scholar does not change its page format under our feet
    # Use some cases to check if all is good
    print "Checking google scholar formats..."
    stopAndGoCites = getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False)[0]
    dragonCites = getCite("Mixes protected by Dragons and Pixies: an empirical study", False, save=False)[0]

    if stopAndGoCites in (0, None):
        print """OOPS.\n
It looks like Google Scholar changed their URL format or their output format.
I went to count the cites for the Stop-and-Go MIXes paper, and got nothing."""
        sys.exit(1)

    if dragonCites is not None:
        print """OOPS.\n
It looks like Google Scholar changed their URL format or their output format.
I went to count the cites for a fictitious paper, and found some."""
        sys.exit(1)
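
# The two lookups above act as canaries: a classic paper that must show a
# nonzero citation count, and a made-up title that must show none. If either
# check fails, we assume Scholar changed its page format and exit before
# polluting the cache with misparsed results.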

def urlIsUseless(u):
    if u.find("freehaven.net/anonbib/") >= 0:
        # Our own cache is not the primary citation for anything.
        return True
    elif u.find("owens.mit.edu") >= 0:
        # These citations only work for 'members of the MIT community'.
        return True
    else:
        return False

URLTYPES = [ "pdf", "ps", "txt", "ps_gz", "html" ]
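
# These suffixes mirror the BibTeX fields checked in the main block below:
# a "pdf" link, for example, is expected under the key "www_pdf_url".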

if __name__ == '__main__':
    # First download the bibliography file.
    import BibTeX
    suggest = False
    if sys.argv[1] == 'suggest':
        suggest = True
        del sys.argv[1]

    config.load(sys.argv[1])
    if config.CACHE_UMASK is not None:
        os.umask(config.CACHE_UMASK)
    bib = BibTeX.parseFile(config.MASTER_BIB)
    remove_old()

    print "Downloading missing ranks."
    for ent in bib.entries:
        getCite(ent['title'], cache=True, update=True)

    if suggest:
        for ent in bib.entries:
            haveOne = False
            for utype in URLTYPES:
                if ent.has_key("www_%s_url"%utype):
                    haveOne = True
                    break
            if haveOne:
                continue
            print ent.key, "has no URLs given."
            urls = [ u for u in getPaperURLs(ent['title']) if not urlIsUseless(u) ]
            for u in urls:
                print "\t", u
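
# Invocation sketch (config filename is illustrative; the optional leading
# 'suggest' argument additionally prints candidate URLs for entries that
# lack any www_*_url field):
#   python rank.py suggest anonbib.cfg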