Source code for dendropy.utility.bibtex

#! /usr/bin/env python

##############################################################################
##  DendroPy Phylogenetic Computing Library.
##
##  Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
##  All rights reserved.
##
##  See "LICENSE.rst" for terms and conditions of usage.
##
##  If you use this work or any portion thereof in published work,
##  please cite it as:
##
##     Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
##     for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################

"""
BibTeX interface.
"""

import re
import textwrap
import sys
from dendropy.utility.container import OrderedCaselessDict

# default order of fields
BIBTEX_FIELDS = [
    'author',
    'year',
    'title',
    'journal',
    'volume',
    'number',
    'editor',
    'booktitle',
    'series',
    'chapter',
    'pages',
    'publisher',
    'institution',
    'address',
    'annote',
    'edition',
    'howpublished',
    'key',
    'month',
    'organization',
    'school',
    'note',
    'abstract',
    'keywords',
    'localfile',
    ]

def _clean_parsed_text(text):
    """
    Strips outer quotes, curly braces, etc.; remove multiple
    consecutive whitespaces, etc.
    """
    if text.startswith('{') and text.endswith('}'):
        text = text[1:-1]
    elif text.startswith('"') and text.endswith('"'):
        text = text[1:-1]
    text = re.sub("[\s]+", " ", text).strip()
    return text

def _format_bibtex_value(text, col_start=1, wrap=True, width=78):
    """
    Formats text of a BibTeX field.
    """
    ftext = re.sub("[\s]+", " ", text).strip()
    col_indent = " " * col_start
    if not ftext[0].isdigit():
        if wrap:
            initial_indent = '{'
            subsequent_indent = ' '
        else:
            ftext = '{' + ftext
        ftext = ftext + '}'
    else:
        initial_indent = ''
        subsequent_indent = ''
    if wrap:
        wrapped = textwrap.wrap(ftext,
                                width=width,
                                initial_indent=initial_indent,
                                subsequent_indent=subsequent_indent)
        for index, line in enumerate(wrapped[1:]):
            wrapped[index+1] = (" " * col_start) + wrapped[index+1]
        return '\n'.join(wrapped)
    else:
        return ftext

[docs]class BibTexEntry(object): """ Tracks a single BibTeX entry. """ decompose_pattern = re.compile(r'^@(\w*)\s*{\s*([\w|\:|\-]*),(.*)}') # works, but misses last field field_pattern = re.compile(r'\s*([\w|\-]*?)\s*=\s*(.*?),(?=\s*[\w|\-]*\s*\=)') # get the last field last_field_pattern = re.compile(r'\s*([\w|\-]*?)\s*=\s*(.*?)\s*[,]*\s*$') def __init__(self, citation=None): """ Sets up internal dictionary of BibTeX fields, and initializes if argument is given. """ self.bibtype = None self.citekey = None if isinstance(citation, BibTexEntry): self._entry_dict = OrderedCaselessDict(citation._entry_dict) elif isinstance(citation, dict): self._entry_dict = OrderedCaselessDict() for k, v in citation.items(): self._entry_dict[k.lower()] = v self.bibtype = self._entry_dict.get("bibtype", None) self.citekey = self._entry_dict.get("citekey", None) else: self._entry_dict = OrderedCaselessDict() self.parse_text(citation)
[docs] def __getattr__(self, name): """ Allows bibtex fields (and any additional ones) to be treated like object attributes. """ entry_dict = self._get_entry_dict() if name == '_entry_dict' or name == '_BibTexEntry_entry_dict': return entry_dict elif name == '__dict__': return object.__getattribute__(self, '__dict__') elif name == 'bibtype' and hasattr(self, 'bibtype'): return object.__getattribute__(self, '__dict__')['bibtype'] elif name == 'citekey' and hasattr(self, 'citekey'): return object.__getattribute__(self, '__dict__')['citekey'] elif name in entry_dict: return entry_dict[name] elif name in BIBTEX_FIELDS: return "" else: raise AttributeError(name)
[docs] def __setattr__(self, name, value): """ Allows bibtex fields (and any additional ones) to be treated like object attributes. """ entry_dict = self._get_entry_dict() if name == '_entry_dict' or name == '_BibTexEntry_entry_dict': entry_dict = value elif name == 'bibtype' or name == 'citekey': object.__setattr__(self, name, value) else: self._entry_dict[name] = value
[docs] def __delattr__(self, name): """ Allows bibtex fields (and any additional ones) to be treated like object attributes. """ entry_dict = self._get_entry_dict() if name == '_entry_dict' or name == '_BibTexEntry_entry_dict': object.__delattr__(self, '_entry_dict') elif name in entry_dict: del(entry_dict[name]) elif name in BIBTEX_FIELDS: pass elif name in object.__getattribute__(self, '__dict__'): object.__delattr__(name) else: raise AttributeError(name)
[docs] def __str__(self): """ String representation of self. """ return self.as_bibtex()
[docs] def __repr__(self): """ Internal representation of self. """ repr_dict = {} repr_dict['bibtype'] = self.bibtype repr_dict['citekey'] = self.citekey repr_dict.update(self.fields_as_dict()) return repr_dict
def _get_entry_dict(self): """ Returns the internal field dictionary, creating it first if neccessary. """ if not hasattr(self, '_entry_dict'): object.__setattr__(self, '_entry_dict', {}) return object.__getattribute__(self, '_entry_dict') def _get_fields(self): """ Returns list of populated fields in order (does not include bibtype and citekey). """ fields = [] for field in BIBTEX_FIELDS: if field in self._entry_dict: fields.append(field) for key in self._entry_dict: if key not in fields: fields.append(key) return fields fields = property(_get_fields)
[docs] def parse_text(self, text): """ Parses a BibTeX text entry. """ text = text.replace("\n", "") self.bibtype = None self.citekey = None text = text.strip() decompose_match = self.decompose_pattern.match(text) try: self.bibtype = decompose_match.group(1) except AttributeError as exception: raise ValueError("Failed to parse bibtype: {}".format(text)) try: self.citekey = decompose_match.group(2) except AttributeError as exception: raise ValueError("Failed to parse citekey: {}".format(text)) remaining = decompose_match.group(3) field_match = self.field_pattern.match(remaining) while field_match: field_match = self.field_pattern.match(remaining) if field_match: field_name = field_match.group(1).lower() field_value = _clean_parsed_text(field_match.group(2)) self._entry_dict[field_name] = field_value remaining = remaining.replace(field_match.group(), '') if remaining: last_field_match = self.last_field_pattern.match(remaining) if last_field_match: field_name = last_field_match.group(1).lower() field_value = _clean_parsed_text(last_field_match.group(2)) self._entry_dict[field_name] = field_value
[docs] def fields_as_dict(self): """ Returns the fields (i.e., all public attributes except for bibtype and citekey as a dictionary). """ return dict(self._entry_dict)
[docs] def as_bibtex(self, wrap_width=78): """ Composes entry in BibTex format. """ entry = [] sep = " = " entry.append('@{}{{},'.format((self.bibtype, self.citekey))) fields = self.fields # maxlen = max([len(field) for field in fields]) maxlen = max([len(field) for field in BIBTEX_FIELDS]) for field in fields: if field != 'url': wrap = True else: wrap = False field_header = field.ljust(maxlen) field_value = _format_bibtex_value(self._entry_dict[field], wrap=wrap, width = wrap_width - maxlen - len(sep) + 2, col_start = maxlen + len(sep) + 2 ) entry.append(" {}{}{},".format((field_header, sep, field_value))) entry.append('}') return '\n'.join(entry)
[docs] def as_compact_bibtex(self): """ Composes entry in BibTex format. """ entry = [] entry.append('@{}{{{},'.format((self.bibtype, self.citekey))) fields = self.fields for field in fields: field_value = _format_bibtex_value(self._entry_dict[field], wrap=False, width=None, col_start=1) entry.append("{}={},".format((field, field_value))) entry.append('}') return ''.join(entry)
class BibTexDb(object): """ A BibTeX database. """ def __init__(self, bibfilepath=None): """ Initializes database, optionally from file. """ self.entries = [] if bibfilepath: self.add_from_file(bibfilepath) def add_from_file(self, filepath): """ Reads and loads a BibTeX database file. """ bibfile = open(filepath, 'r') self.add_from_text(bibfile.read()) bibfile.close() def add_from_text(self, text): """ Loads from text. """ text = text.replace('\n','') entry_pattern = re.compile(r'@\w*{([^{}]+{[^{}]*({[^{}]*}[^{}]*)*})+}') for match in entry_pattern.finditer(text): entry = BibTexEntry(match.group()) self.entries.append(entry)