4b673482eaac — Phillip Alday tip 4 years ago
recent features from futhark git (#922c76)
5 files changed, 139 insertions(+), 69 deletions(-)

A => environment.yml
M extractbib.py
M hg-diff-md.py
M readme.md
A => requirements.txt
A => environment.yml +14 -0
@@ 0,0 1,14 @@ 
+name: futhark
+channels:
+  - defaults
+dependencies:
+  - pandoc>=2.0
+  - pip
+  - python>=3.5
+  - pip:
+    - bibtexparser==1.2.0
+    - pandoc-eqnos==2.5.0
+    - pandoc-fignos==2.4.0
+    - pandoc-tablenos==2.3.0
+    - pandoc-xnos==2.5.0
+    - pandocfilters==1.4.3

          
M extractbib.py +78 -60
@@ 1,18 1,38 @@ 
 #! /usr/bin/env python
-# s. https://gist.github.com/palday/1ff12dd110255541df0f
-# adapted from
-# GitHub Gist https://gist.github.com/tpoisot/7406955
-# don't forget to install bibtexparser: http://bibtexparser.readthedocs.org/en/latest/install.html
-# or with pip:
-# pip install bibtexparser
+# Copyright (c) 2014-2021, Phillip Alday
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>
 
+import argparse
 import sys
-import codecs
-from bibtexparser.bparser import BibTexParser, logger
+import bibtexparser
+
+import logging
 from logging import NullHandler
-logger.addHandler(NullHandler())
+logging.basicConfig(format='\033[1m\033[33m%(levelname)s:\033[0m %(message)s')
 
-non_local_fields = ['address',
+from bibtexparser.bparser import BibTexParser, logger
+from bibtexparser.bibdatabase import BibDatabase
+from bibtexparser.bwriter import BibTexWriter
+from bibtexparser.customization import convert_to_unicode, homogenize_latex_encoding
+logger.setLevel(logging.ERROR)
+
+# fix missing standard types
+bibtexparser.bibdatabase.STANDARD_TYPES.add("collection")
+bibtexparser.bibdatabase.STANDARD_TYPES.add("periodical")
+
+NON_LOCAL_FIELDS = ['address',
                     'annote',
                     'author',
                     'booktitle',

          
@@ 38,59 58,57 @@ non_local_fields = ['address',
                     'link',
                     'volume',
                     'year',
-                    'eprint',
-                    'eprintclass',
-                    'eprinttype',
-                    'date'
                   ]
 
-def dict2bib(ke,di):
-   # it seems the type field changed between different bibtexparser versions
-   try:
-      b = "@"+di['type'].upper()+"{"+ke+",\n"
-   except KeyError:
-      b = "@"+di['ENTRYTYPE'].upper()+"{"+ke+",\n"
+def prune(entry):
+   """
+      prune(entry)
+
+   Remove local fields from a BibTeX entry.
+
+   Local fields include things like "date-added" and references to document
+   storage.
+
+   This function uses `NON_LOCAL_FIELDS` as a whitelist, instead of
+   blacklisting local fields.
+   """
+   keepers = NON_LOCAL_FIELDS + ['ID', 'ENTRYTYPE'] # bibtexparser fields
+   return {field:value for field, value in entry.items() if field in keepers}
 
-   try:
-      items = di.iteritems()
-   except AttributeError:
-      items = di.items()
+argparser = argparse.ArgumentParser(
+   description="Extract minimal BibTeX entries from a large bibliography")
+argparser.add_argument('keylist', type=open,
+   help="Filename of a newline delimited list of BibTeX keys")
+argparser.add_argument('bibfile', type=open,
+   help="BibTeX file to extract entries from")
+argparser.add_argument('outfile', type=argparse.FileType('w', encoding='UTF-8'),
+   help="Destination file for extracted keys (will be overwritten")
+# TODO expose additional bibtexparser options, e.g.
+# parser = BibTexParser(common_strings=False)
+# parser.ignore_nonstandard_types = False
+# parser.homogenise_fields = False
+# allow for more verbose logging
+
+def main(argv=None):
+    args = argparser.parse_args(argv)
 
-   for (k, v) in sorted(items):
-      if k.lower().strip() in non_local_fields:
-         if k == 'link':
-            k = 'url'
-         b += '\t' + k + ' = {'+v+'},\n'
-   b += '}\n'
-   return b
+    keys = [_.rstrip() for _ in args.keylist.readlines()]
+
+    bibparser = BibTexParser(common_strings=True)
+    bibparser.customization = convert_to_unicode
+
+    allrefs = bibtexparser.load(args.bibfile, parser=bibparser).get_entry_dict()
+    usedrefs = BibDatabase()
+    usedrefs.entries = [prune(allrefs[key]) for key in keys if key in allrefs]
+
+    missing = [key for key in keys if key not in allrefs]
+
+    if missing:
+        logging.warning("Following keys not found: {}".format(', '.join(missing)))
+
+    writer = BibTexWriter()
+    #writer.indent = ' ' * 4
+    args.outfile.write(writer.write(usedrefs))
 
 if __name__ == "__main__":
-   ## Check the number of arguments
-   if len(sys.argv) != 4:
-      raise ValueError("Wrong number of arguments")
-   else :
-      key_list = sys.argv[1]
-      bib_file = sys.argv[2]
-      out_file = sys.argv[3]
-   ## The three arguments should be strings
-   if not isinstance(key_list, str):
-      raise TypeError("The path to the list of keys should be a string")
-   if not isinstance(bib_file, str):
-      raise TypeError("The path to the bibtex library should be a string")
-   if not isinstance(out_file, str):
-      raise TypeError("The path to the output bibtex file should be a string")
-   ## Step 1 - read the key list
-   keys = [kl.rstrip(":\n") for kl in open(key_list, 'r')]
-   ## Step 2 - read the library file
-   refs = BibTexParser(open(bib_file, 'r').read()).get_entry_dict()
-   ## Step 3 - extract the used entries
-   used_refs = {key: refs[key] for key in keys if key in refs}
-   ## Step 4 - convert the dicts back into bibtex
-   try:
-      used_refs_iter = used_refs.iteritems()
-   except AttributeError:
-      used_refs_iter = used_refs.items()
-   refs_as_bib = [dict2bib(k, v) for (k, v) in used_refs_iter ]
-   ## Step 5 - write the output file
-   with codecs.open(out_file, 'w', 'utf-8-sig') as of:
-      of.writelines(refs_as_bib)
+   sys.exit(main())

          
M hg-diff-md.py +17 -1
@@ 1,4 1,20 @@ 
-#! /bin/env python3
+#! /usr/bin/env python3
+"""
+Copyright (c) 2018, Phillip Alday
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
 
 import sys
 import argparse

          
M readme.md +20 -8
@@ 1,6 1,3 @@ 
-Futhark
-=========
-
 Markdown and DVCS can change the way we collaborate. Even for those pesky journals that require a submission in Word format, you can do the majority of your work in Markdown and then convert to Word via pandoc (and if need be LibreOffice to convert .odt/.docx to .doc). However, a common bibliography is still difficult to pull off. The Makefile and scripts here automagically extract the relevant pandoc references from the specified BibTeX library and create a minimal local BibTeX file that you can include in your repository.
 
 This was inspired by and expands upon the ideas in [this blog post](http://timotheepoisot.fr/2013/11/10/shared-bibtex-file-markdown/).

          
@@ 10,9 7,24 @@ Requirements:
 - `make`
 - [Mercurial](http://mercurial.selenic.com/) ([git version available on GitHub](https://github.com/palday/futhark))
 - [`pandoc`](http://johnmacfarlane.net/pandoc/)
--  [Python](https://www.python.org/)
-    - currently using Python 2.x because of
-    - [BibTexParser](http://bibtexparser.readthedocs.org/en/latest/install.html)
+- [Python (3)](https://www.python.org/)
+- [BibTexParser](http://bibtexparser.readthedocs.org/en/latest/install.html)
+
+You can install `pandoc` and all the Python tooling via [conda](https://www.anaconda.com/products/individual):
+```bash
+user@host:~/projectdir$ conda env create -f environment.yml
+user@host:~/projectdir$ conda activate futhark
+(futhark) user@host:~/projectdir$ conda activate futhark
+```
 
-License:
-My contributions are currently GPLv2, but I am building on the work of others, whose licensing conditions aren't yet clear. The LaTeX template is a modification of the standard pandoc template and is thus subject to [the same restrictions](https://github.com/jgm/pandoc-templates).
  No newline at end of file
+If you prefer to use your system Python, virtual environments, PyEnv, etc., then the Python packages are available via pip:
+```bash
+user@host:~/projectdir$ python -m pip install -r requirements.txt
+```
+
+The `mdwc` tool is useful for counting words in Markdown documents with a YAML header block; it is developed [here](https://github.com/palday/mdwc).
+
+## License
+My contributions are GPLv2. Previous versions used code with an unclear license, but that has been removed in current versions.
+
+The LaTeX template is a modification of the standard pandoc template and is thus subject to [the same restrictions](https://github.com/jgm/pandoc-templates).

          
A => requirements.txt +10 -0
@@ 0,0 1,10 @@ 
+bibtexparser==1.2.0
+certifi==2020.12.5
+future==0.18.2
+pandoc-eqnos==2.5.0
+pandoc-fignos==2.4.0
+pandoc-tablenos==2.3.0
+pandoc-xnos==2.5.0
+pandocfilters==1.4.3
+psutil==5.8.0
+pyparsing==2.4.7