cite-as / bibtex.py
raannakasturi's picture
Upload 124 files
dc5b905 verified
#!/usr/bin/python
# -*- coding: utf-8 -*-
from citeproc.py2compat import *
# copied from https://github.com/brechtm/citeproc-py/blob/master/citeproc/source/bibtex/bibtex.py
# then modified to fix bugs. search for "hap" to see mods
import re
import unicodedata
from warnings import warn
from citeproc.types import (
ARTICLE,
ARTICLE_JOURNAL,
BOOK,
CHAPTER,
MANUSCRIPT,
PAMPHLET,
PAPER_CONFERENCE,
REPORT,
THESIS,
)
from citeproc.string import String, MixedString, NoCase
from citeproc.source import BibliographySource, Reference, Name, Date, DateRange
from citeproc.source.bibtex.bibparse import BibTeXParser
from citeproc.source.bibtex.latex import parse_latex
from citeproc.source.bibtex.latex.macro import NewCommand, Macro
class BibTeX(BibliographySource):
    """Bibliography source that builds CSL references from a BibTeX database.

    On construction the database is parsed with ``BibTeXParser`` and every
    entry is converted with :meth:`create_reference` and registered through
    ``self.add``.
    """

    # Maps BibTeX field names to the CSL variable names used as keyword
    # arguments for ``Reference``.  Fields that are not listed here are
    # passed through under their BibTeX name (see ``_bibtex_to_csl``).
    fields = {
        "address": "publisher_place",
        "annote": "annote",
        "author": "author",
        "booktitle": "container_title",
        "chapter": "chapter_number",
        "edition": "edition",
        "editor": "editor",
        # 'howpublished': None,
        # 'institution': None,
        "journal": "container_title",
        # 'month': None,
        "note": "note",
        "number": "issue",
        # 'organization': None,
        "pages": "page",
        "publisher": "publisher",
        # 'school': None,
        "series": "collection_title",
        "title": "title",
        # 'type': None,
        # 'year': None,
        "volume": "volume",
        # hap added doi and url
        "doi": "doi",
        "url": "url",
        # non-standard fields
        "isbn": "ISBN",
        "issn": "ISSN",
    }

    # Maps BibTeX entry types to CSL item types.
    # NOTE(review): "incollection" maps to ARTICLE_JOURNAL rather than
    # CHAPTER -- confirm this is intended.
    types = {  # standard entry types
        "article": ARTICLE_JOURNAL,
        "book": BOOK,
        "booklet": PAMPHLET,
        "conference": PAPER_CONFERENCE,
        "inbook": CHAPTER,
        "incollection": ARTICLE_JOURNAL,
        "inproceedings": PAPER_CONFERENCE,
        "manual": BOOK,
        "mastersthesis": THESIS,
        "misc": ARTICLE,
        "phdthesis": THESIS,
        "proceedings": BOOK,
        "techreport": REPORT,
        "unpublished": MANUSCRIPT,
        # non-standard entry types
        "thesis": THESIS,
        "report": REPORT,
    }

    def __init__(self, filename, encoding="ascii"):
        """Parse the BibTeX database and register every entry.

        :param filename: handed unchanged to ``BibTeXParser``.
        :param encoding: stored on the parsed database as ``encoding``.
        """
        bibtex_database = BibTeXParser(filename)
        bibtex_database.encoding = encoding
        # Macros defined with \newcommand in the preamble are collected here
        # and reused when the individual field values are un-LaTeXed.
        self.preamble_macros = {}
        parse_latex(
            bibtex_database.preamble,
            {
                "newcommand": NewCommand(self.preamble_macros),
                "mbox": Macro(1, "{0}"),
                "cite": Macro(1, "CITE({0})"),
            },
        )
        for key, entry in bibtex_database.items():
            self.add(self.create_reference(key, entry))

    def _bibtex_to_csl(self, bibtex_entry):
        """Convert the fields of a BibTeX entry into a dict of CSL fields.

        :param bibtex_entry: mapping of BibTeX field name to raw value.
        :returns: dict keyed by CSL field name; fields without an entry in
            ``fields`` keep their BibTeX name.
        """
        csl_dict = {}
        for field, value in bibtex_entry.items():
            try:
                value = value.strip()
            except AttributeError:
                # Values without a ``strip`` method are used as they are.
                pass
            csl_field = self.fields.get(field, field)
            if field in ("number", "volume"):
                try:
                    value = int(value)
                except ValueError:
                    # Not purely numeric; keep the original text.
                    pass
            elif field == "pages":
                value = self._bibtex_to_csl_pages(value)
            elif field in ("author", "editor"):
                try:
                    value = list(self._parse_author(value))
                except RuntimeError:
                    # Best effort: when name parsing fails with RuntimeError
                    # the raw value is kept instead of aborting the entry.
                    pass
            else:
                try:
                    value = self._parse_string(value)
                except TypeError:
                    value = str(value)
            csl_dict[csl_field] = value
        return csl_dict

    @staticmethod
    def _bibtex_to_csl_pages(value):
        """Normalize a BibTeX ``pages`` value to the CSL ``page`` format.

        Spaces are removed.  A range written with one or more hyphens becomes
        ``first-last``; a ``--`` (or longer) separator is tried first so that
        single hyphens inside the page labels survive.  Without a hyphen, a
        trailing ``+`` is dropped.  A value containing hyphens that cannot be
        read as exactly two parts is returned as it is (minus spaces) instead
        of raising.

        :param value: the raw pages value; converted with ``str`` first so
            that non-string values are accepted as well.
        :returns: the normalized page string.
        """
        value = str(value).replace(" ", "")
        if "-" not in value:
            return value[:-1] if value.endswith("+") else value
        parts = re.split(r"-{2,}", value)
        if len(parts) != 2:
            parts = re.split(r"-+", value)
        if len(parts) != 2:
            return value
        return "-".join(parts)

    def _bibtex_to_csl_date(self, bibtex_entry):
        """Build the issued date of an entry from its ``year`` field.

        The ``month`` field is deliberately ignored (hap, Feb 18, 2017: the
        month handling was causing bugs).

        :returns: ``None`` when the entry has no ``year``, a ``Date`` for a
            single year, otherwise a ``DateRange``.
        """
        begin_dict, end_dict = {}, {}
        if "year" in bibtex_entry:
            begin_dict["year"], end_dict["year"] = self._parse_year(
                bibtex_entry["year"]
            )
        if not begin_dict:
            return None
        if begin_dict == end_dict:
            return Date(**begin_dict)
        return DateRange(begin=Date(**begin_dict), end=Date(**end_dict))

    def _parse_year(self, year):
        """Parse a BibTeX ``year`` value into a ``(begin, end)`` pair.

        A single year yields the same ``int`` twice.  A range separated by an
        en dash yields both ends; an abbreviated end year borrows the leading
        digits of the begin year (``1995<en dash>98`` gives 1995 and 1998).
        Both ends are converted to ``int`` so that ranges have the same type
        as single years; when an end is not numeric the text is kept.

        :raises ValueError: if a value without an en dash is not an integer.
        """
        try:
            year_str = parse_latex(year, self.preamble_macros)
        except TypeError:
            # The value could not be handled by parse_latex (presumably a
            # non-string such as an int); use its string form.
            year_str = str(year)
        if EN_DASH in year_str:
            begin_year, end_year = year_str.split(EN_DASH)
            begin_len, end_len = len(begin_year), len(end_year)
            if end_len < begin_len:
                end_year = begin_year[: begin_len - end_len] + end_year
            try:
                begin_year, end_year = int(begin_year), int(end_year)
            except ValueError:
                # Non-numeric range end: fall back to the textual years.
                pass
        else:
            begin_year = end_year = int(year_str)
        return begin_year, end_year

    # Three-letter month abbreviations; the position + 1 is the month number.
    MONTHS = (
        "jan",
        "feb",
        "mar",
        "apr",
        "may",
        "jun",
        "jul",
        "aug",
        "sep",
        "oct",
        "nov",
        "dec",
    )
    # Raw strings so that \d and \w are regex classes, not string escapes.
    RE_DAY = r"(?P<day>\d+)"
    RE_MONTH = r"(?P<month>\w+)"

    @staticmethod
    def _parse_month(month):
        """Parse a BibTeX ``month`` value into ``(begin, end)`` date dicts.

        Accepted forms: a number, a month name, a range of month names joined
        by ``-`` or ``", "``, or a day combined with a month in either order.
        Each dict has an integer ``month`` and, for the day form, ``day``.

        :raises ValueError: if the value matches none of these forms.
        """

        def month_name_to_index(name):
            # Names are matched on their first three letters; anything else
            # has to be a month number.
            try:
                return BibTeX.MONTHS.index(name[:3].lower()) + 1
            except ValueError:
                return int(name)

        begin = {}
        end = {}
        month = month.strip()
        month = month.replace(", ", "-")
        if month.isdecimal():
            begin["month"] = end["month"] = month
        elif month.replace("-", "").isalpha():
            if "-" in month:
                begin["month"], end["month"] = month.split("-")
            else:
                begin["month"] = end["month"] = month
        else:
            m = re.match(BibTeX.RE_DAY + "[ ~]*" + BibTeX.RE_MONTH, month)
            if m is None:
                m = re.match(BibTeX.RE_MONTH + "[ ~]*" + BibTeX.RE_DAY, month)
            if m is None:
                raise ValueError(
                    "Unrecognized BibTeX month: {!r}".format(month)
                )
            begin["day"] = end["day"] = int(m.group("day"))
            begin["month"] = end["month"] = m.group("month")
        begin["month"] = month_name_to_index(begin["month"])
        end["month"] = month_name_to_index(end["month"])
        return begin, end

    def _parse_string(self, title):
        """Convert a BibTeX string value into a ``MixedString``.

        Whitespace is collapsed first.  Text inside a top-level ``{...}``
        group becomes a ``NoCase`` segment unless the group starts with a
        backslash; all other text becomes ``String`` segments.  Braces nested
        deeper than the top level are dropped.

        :raises SyntaxError: if the braces do not balance.
        """

        def make_string(string, top_level_group=False):
            unlatexed = parse_latex(string, self.preamble_macros)
            fixed_case = top_level_group and not string.startswith("\\")
            string_cls = NoCase if fixed_case else String
            return string_cls(unlatexed)

        title = str(title)
        title = title.replace("\n", "")
        title = " ".join(title.split())
        output = MixedString()
        level = 0
        string = ""
        for char in title:
            if char == "{":
                if level == 0:
                    # Flush the text collected before this group.
                    if string:
                        output += make_string(string)
                        string = ""
                level += 1
            elif char == "}":
                level -= 1
                if level == 0:
                    output += make_string(string, True)
                    string = ""
            else:
                string += char
        if level != 0:
            raise SyntaxError('Non-matching braces in "{}"'.format(title))
        if string:
            output += make_string(string)
        return output

    def _parse_author(self, authors):
        """Convert a BibTeX name list into a list of ``Name`` objects.

        The value is split with ``split_names`` and each name is broken up
        with ``parse_name``; parts that are ``None`` are left out.
        """
        csl_authors = []
        for author in split_names(authors):
            first, von, last, jr = parse_name(author)
            csl_parts = {}
            for part, csl_label in [
                (first, "given"),
                (von, "non-dropping-particle"),
                (last, "family"),
                (jr, "suffix"),
            ]:
                if part is not None:
                    csl_parts[csl_label] = parse_latex(part, self.preamble_macros)
            name = Name(**csl_parts)
            csl_authors.append(name)
        return csl_authors

    def create_reference(self, key, bibtex_entry):
        """Create a ``Reference`` for a single BibTeX entry.

        :param key: the citation key of the entry.
        :param bibtex_entry: the parsed entry; its ``document_type`` selects
            the CSL type.
        :raises KeyError: if the entry type is not listed in ``types``.
        """
        csl_type = self.types[bibtex_entry.document_type]
        csl_fields = self._bibtex_to_csl(bibtex_entry)
        csl_date = self._bibtex_to_csl_date(bibtex_entry)
        if csl_date:
            csl_fields["issued"] = csl_date
        ref = Reference(key, csl_type, **csl_fields)
        return ref
# BibTeX name handling
#
# references
# - BibTeXing by Oren Patashnik (Feb 8, 1988), 4. Helpful Hints, item 18
# (BibTeX 0.99d - http://www.ctan.org/tex-archive/biblio/bibtex/base/btxdoc.pdf)
# - A summary of BibTex by Xavier Décoret
# (http://maverick.inria.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html)
# - Tame the BeaST by Nicolas Markey
# (http://tug.ctan.org/info/bibtex/tamethebeast/ttb_en.pdf)
# Separator between the individual names of a BibTeX name list.
AND = " and "


def split_names(string):
    """Return the individual names of a BibTeX name list.

    Names are separated by ``' and '``; a separator enclosed in braces does
    not split.  A trailing empty name is not included in the result.
    """
    depth = 0
    start = 0
    found = []
    for pos, char in enumerate(string):
        if depth == 0 and string.startswith(AND, pos):
            found.append(string[start:pos])
            start = pos + len(AND)
        elif char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
    tail = string[start:]
    if tail:
        found.append(tail)
    return found
def parse_name(name):
    """Parse a BibTeX name string and split it into First, von, Last and Jr
    parts.

    Three layouts are recognized, selected by the number of comma-separated
    parts: "First von Last", "von Last, First" and "von Last, Jr, First".
    A name with more than two commas is read as the third layout, with all
    parts after the second comma joined into First, instead of failing on an
    unbound variable.

    Returns a ``(first, von, last, jr)`` tuple of strings; ``first``, ``von``
    and ``jr`` are ``None`` when empty.
    """
    parts = split_name(name)
    first, jr = [], []
    if len(parts) == 1:  # First von Last
        (first_von_last,) = parts
        index = 0
        # The leading capitalized (or case-less) words form First; the final
        # word is always kept for Last.
        for word in first_von_last[:-1]:
            if is_capitalized(word) not in (True, None):
                break
            first.append(word)
            index += 1
        von_last = first_von_last[index:]
    elif len(parts) == 2:  # von Last, First
        von_last, first = parts
    else:  # von Last, Jr, First (any further parts are folded into First)
        von_last, jr = parts[0], parts[1]
        first = [word for part in parts[2:] for word in part]
    von, last = split_von_last(von_last)
    join = " ".join
    return join(first) or None, join(von) or None, join(last), join(jr) or None
def split_name(name):
    """Split a name into parts delimited by commas (at brace-level 0), and
    each part into words.

    Spaces and tabs at brace-level 0 separate the words; inside braces they
    are kept as part of the word.

    Returns a list of lists of words.
    """
    depth = 0
    groups = []
    words = []
    chars = []
    for ch in name:
        if depth == 0 and ch in (" ", "\t", ","):
            # A separator outside braces ends the current word ...
            if chars:
                words.append("".join(chars))
                chars = []
            # ... and a comma ends the current part as well.
            if ch == ",":
                groups.append(words)
                words = []
            continue
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
        chars.append(ch)
    if chars:
        words.append("".join(chars))
    groups.append(words)
    return groups
def is_capitalized(string):
    """Tell whether a BibTeX substring starts with an upper-case letter.

    Only letters at brace-level 0, or inside a "special character" (a
    top-level brace group that starts with a backslash), are considered.
    Returns `None` for a "case-less" string, i.e. one without such a letter.
    """
    depth = 0
    in_special = False
    for current, following in lookahead_iter(string):
        if current.isalpha() and (depth == 0 or in_special):
            # The first letter that counts decides the result.
            return current.isupper()
        if current == "{":
            depth += 1
            if depth == 1 and following == "\\":
                in_special = True
        elif current == "}":
            depth -= 1
            if depth == 0:
                in_special = False
    return None  # case-less
def split_von_last(words):
    """Split the words of a "von Last" name into its von and Last parts.

    A von part exists only when there is more than one word and the first
    word is lower-case; it then extends up to the last lower-case word that
    is not the final word.  Returns a ``(von, last)`` pair of word lists.
    """
    if len(words) <= 1 or is_capitalized(words[0]) is not False:
        return [], words
    # Scan backwards, skipping the final word which always belongs to Last.
    for offset, candidate in enumerate(reversed(words[:-1]), start=1):
        if is_capitalized(candidate) not in (True, None):
            return words[:-offset], words[-offset:]
    return [], words
def lookahead_iter(iterable):
    """Iterator that also yields the next item along with each item. The next
    item is `None` when yielding the last item.

    An empty iterable yields nothing.
    """
    items = iter(iterable)
    try:
        item = next(items)
    except StopIteration:
        # A StopIteration escaping a generator body is turned into a
        # RuntimeError (PEP 479), so end the generator explicitly.
        return
    for next_item in items:
        yield item, next_item
        item = next_item
    yield item, None
# The en dash character (U+2013); BibTeX._parse_year splits year ranges on it.
EN_DASH = unicodedata.lookup("EN DASH")