Added anonbib for use in papers list

Source: https://gitweb.torproject.org/anonbib.git Commit: b478fc493d4be2115185d94e077bf06196495417
2013-08-11 11:14:00 +00:00
parent e38ef21b71
commit 7c42ce8329
23 changed files with 11209 additions and 0 deletions
--- a/i2p2www/anonbib/reconcile.py
+++ b/i2p2www/anonbib/reconcile.py
@@ -0,0 +1,292 @@
+#!/usr/bin/python2
+# Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.
+
+"""Code to determine which entries are new and which are old.
+
+   To scan a new file, run "python reconcile.py anonbib.cfg new-file.bib".  This
+   will generate a new bibtex file called 'tmp.bib', with all the new entries
+   cleaned up a little, and all the duplicate entries commented out.
+"""
+
+import sys
+import re
+
+assert sys.version_info[:3] >= (2,2,0)
+
+import BibTeX
+import config
+import metaphone
+
+_MPCACHE = {}
+def soundsLike(s1, s2):
+    c = _MPCACHE
+    s1 = clean(s1)
+    s2 = clean(s2)
+    try:
+        m1 = c[s1]
+    except KeyError:
+        m1 = c[s1] = metaphone.metaphone(s1)
+    try:
+        m2 = c[s2]
+    except KeyError:
+        m2 = c[s2] = metaphone.metaphone(s2)
+
+    return m1 == m2
+
+def mphone(s):
+    c = _MPCACHE
+    s = clean(s)
+    try:
+        return c[s]
+    except:
+        m = c[s] = metaphone.metaphone(s)
+        return m
+
+def clean(s):
+    s = re.sub(r'\s+', ' ', s)
+    s = s.strip()
+    return s
+
+class MasterBibTeX(BibTeX.BibTeX):
+    def __init__(self):
+        BibTeX.BibTeX.__init__(self)
+
+    def buildIndex(self):
+        self.byTitle = {}
+        for ent in self.entries:
+            for t in self._titleForms(ent['title']):
+                self.byTitle.setdefault(t, []).append(ent)
+
+    def _titleForms(self, title):
+        title = title.lower()
+        title = re.sub(r'\b(an|a|the|of)\b', "", title)
+        title = clean(title)
+        res = [ mphone(title) ]
+        if ':' in title:
+            for t in title.split(":"):
+                res.append(mphone(t.strip()))
+        #print "%r\n   => %s" % (title,res)
+        return res
+
+    def _titlesAlike(self, t1, t2):
+        t1 = clean(t1)
+        t2 = clean(t2)
+        if t1 == t2:
+            return 2
+        tf1 = self._titleForms(t1)
+        tf2 = self._titleForms(t2)
+        for t in tf1:
+            if t in tf2: return 1
+        return 0
+
+    def _authorsAlike(self, a1, a2):
+        if not soundsLike(" ".join(a1.last)," ".join(a2.last)):
+            return 0
+
+        if (a1.first == a2.first and a1.von == a2.von
+            and a1.jr == a2.jr):
+            return 2
+
+
+        if soundsLike(" ".join(a1.first), " ".join(a2.first)):
+            return 1
+
+        if not a1.first or not a2.first:
+            return 1
+
+        if self._initialize(a1.first) == self._initialize(a2.first):
+            return 1
+
+        return 0
+
+    def _initialize(self, name):
+        name = " ".join(name).lower()
+        name = re.sub(r'([a-z])[a-z\.]*', r'\1', name)
+        name = clean(name)
+        return name
+
+    def _authorListsAlike(self, a1, a2):
+        if len(a1) != len(a2):
+            return 0
+        a1 = [ (a.last, a) for a in a1 ]
+        a2 = [ (a.last, a) for a in a2 ]
+        a1.sort()
+        a2.sort()
+        if len(a1) != len(a2):
+            return 0
+        r = 2
+        for (_, a1), (_, a2) in zip(a1,a2):
+            x = self._authorsAlike(a1,a2)
+            if not x:
+                return 0
+            elif x == 1:
+                r = 1
+        return r
+
+    def _entryDatesAlike(self, e1, e2):
+        try:
+            if clean(e1['year']) == clean(e2['year']):
+                return 2
+            else:
+                return 0
+        except KeyError:
+            return 1
+
+    def includes(self, ent, all=0):
+        title = ent['title']
+        candidates = []
+        for form in self._titleForms(title):
+            try:
+                candidates.extend(self.byTitle[form])
+            except KeyError:
+                pass
+        goodness = []
+        for knownEnt in candidates:
+            match = (self._entryDatesAlike(ent, knownEnt) *
+                     self._titlesAlike(ent['title'], knownEnt['title']) *
+                     self._authorListsAlike(ent.parsedAuthor,
+                                            knownEnt.parsedAuthor) )
+            if match:
+                goodness.append((match, knownEnt))
+        goodness.sort()
+        if all:
+            return goodness
+        if goodness:
+            return goodness[-1]
+        else:
+            return None, None
+
+    def demo(self):
+        for e in self.entries:
+            matches = self.includes(e, 1)
+            m2 = []
+            mids = []
+            for g,m in matches:
+                if id(m) not in mids:
+                    mids.append(id(m))
+                    m2.append((g,m))
+            matches = m2
+
+            if not matches:
+                print "No match for %s"%e.key
+            if matches[-1][1] is e:
+                print "%s matches for %s: OK."%(len(matches), e.key)
+            else:
+                print "%s matches for %s: %s is best!" %(len(matches), e.key,
+                                                         matches[-1][1].key)
+            if len(matches) > 1:
+                for g, m in matches:
+                    print "%%%% goodness", g
+                    print m
+
+
+def noteToURL(note):
+    " returns tp, url "
+    note = note.replace("\n", " ")
+    m = re.match(r'\s*(?:\\newline\s*)*\s*\\url{(.*)}\s*(?:\\newline\s*)*',
+                 note)
+    if not m:
+        return None
+    url = m.group(1)
+    for suffix, tp in ((".html", "html"),
+                       (".ps", "ps"),
+                       (".ps.gz", "ps_gz"),
+                       (".pdf", "pdf"),
+                       (".txt", "txt")):
+        if url.endswith(suffix):
+            return tp,url
+    return "???", url
+
+all_ok = 1
+def emit(f,ent):
+    global all_ok
+
+    errs = ent._check()
+    if master.byKey.has_key(ent.key.strip().lower()):
+        errs.append("ERROR: Key collision with master file")
+
+    if errs:
+        all_ok = 0
+
+    note = ent.get("note")
+    if ent.getURL() and not note:
+        ent['note'] = "\url{%s}"%ent.getURL()
+    elif note:
+        m = re.match(r'\\url{(.*)}', note)
+        if m:
+            url = m.group(0)
+            tp = None
+            if url.endswith(".txt"):
+                tp = "txt"
+            elif url.endswith(".ps.gz"):
+                tp = "ps_gz"
+            elif url.endswith(".ps"):
+                tp = "ps_gz"
+            elif url.endswith(".pdf"):
+                tp = "pdf"
+            elif url.endswith(".html"):
+                tp = "html"
+            if tp:
+                ent['www_%s_url'%tp] = url
+
+    if errs:
+        all_ok = 0
+    for e in errs:
+        print >>f, "%%%%", e
+
+    print >>f, ent.format(77, 4, v=1, invStrings=invStrings)
+
+def emitKnown(f, ent, matches):
+    print >>f, "%% Candidates are:", ", ".join([e.key for g,e in matches])
+    print >>f, "%%"
+    print >>f, "%"+(ent.format(77,4,1,invStrings).replace("\n", "\n%"))
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print "reconcile.py expects 2 arguments"
+        sys.exit(1)
+
+    config.load(sys.argv[1])
+
+    print "========= Scanning master =========="
+    master = MasterBibTeX()
+    master = BibTeX.parseFile(config.MASTER_BIB, result=master)
+    master.buildIndex()
+
+    print "========= Scanning new file ========"
+    try:
+        fn = sys.argv[2]
+        input = BibTeX.parseFile(fn)
+    except BibTeX.ParseError, e:
+        print "Error parsing %s: %s"%(fn,e)
+        sys.exit(1)
+
+    f = open('tmp.bib', 'w')
+    keys = input.newStrings.keys()
+    keys.sort()
+    for k in keys:
+        v = input.newStrings[k]
+        print >>f, "@string{%s = {%s}}"%(k,v)
+
+    invStrings = input.invStrings
+
+    for e in input.entries:
+        if not (e.get('title') and e.get('author')):
+            print >>f, "%%\n%%%% Not enough information to search for a match: need title and author.\n%%"
+            emit(f, e)
+            continue
+
+        matches = master.includes(e, all=1)
+        if not matches:
+            print >>f, "%%\n%%%% This entry is probably new: No match found.\n%%"
+            emit(f, e)
+        else:
+            print >>f, "%%"
+            print >>f, "%%%% Possible match found for this entry; max goodness",\
+                  matches[-1][0], "\n%%"
+            emitKnown(f, e, matches)
+
+    if not all_ok:
+        print >>f, "\n\n\nErrors remain; not finished.\n"
+
+    f.close()