Add PDF document properties
M .hgignore +1 -0
@@ 4,5 4,6 @@ syntax: glob
 dist-newstyle
 test/*.log
 test/*.aux
+test/*.out
 stack*.yaml.lock
 test/auto

          
M CHANGELOG.md +5 -0
@@ 1,5 1,10 @@ 
 # Revision history for pdftotext
 
+## 0.0.2.0 -- 2020-06-11
+
+* Added PDF document properties (author, title etc.)
+* Added flag `xml-conduit` (parse metadata using `xml-conduit`)
+
 ## 0.0.1.0 -- 2020-05-10
 
 * First version.

          
M README.md +6 -0
@@ 14,6 14,12 @@ main = do
   T.putStrLn $ pdftotext Physical pdf
 ```
 
+## Flags
+
+### `xml-conduit`
+
+`pdftotext` can extract properties from a PDF document. One of them is metadata, which is in the form of an XML document. If the `xml-conduit` flag is set, the metadata is parsed using the `xml-conduit` package; otherwise it is provided as text.
+
 ## Internals
 
 The library uses poppler via FFI, therefore internally all functions are of type `IO`. However, their non-`IO` variants (using `unsafePerformIO`) _should be_ safe to use. Module `Pdftotext.Internal` exposes all `IO`-typed functions.

          
M cbits/poppler.cc +36 -7
@@ 5,6 5,11 @@ 
 
 extern "C" {
 
+  // Convert a poppler::ustring into a heap-allocated std::string holding its
+  // UTF-8 encoding. The string is created with `new` and is not released
+  // here, so the caller takes ownership of the returned pointer.
+  // The argument is only read, hence it is taken by const reference.
+  std::string* to_stdstring(const poppler::ustring& ust) {
+    const std::vector<char> utf8 = ust.to_utf8();
+    return new std::string(utf8.begin(), utf8.end());
+  }
+  
   poppler::document* poppler_document_open_pdf(const char* file) {
     poppler::document* doc = poppler::document::load_from_file(file);
     return doc;

          
@@ 24,6 29,34 @@ extern "C" {
     return doc->pages();
   }
 
+  // Document property accessors.
+  //
+  // Each function reads one property of `doc` (which is dereferenced without
+  // a null check) and returns its UTF-8 text through to_stdstring, i.e. as a
+  // std::string allocated with `new`. None of them releases the string, so
+  // that is left to the caller.
+
+  // Author of the document (poppler::document::get_author).
+  std::string* poppler_document_author(poppler::document* doc) {
+    return to_stdstring(doc->get_author());
+  }
+
+  // Creator of the document (poppler::document::get_creator).
+  std::string* poppler_document_creator(poppler::document* doc) {
+    return to_stdstring(doc->get_creator());
+  }
+
+  // Producer of the document (poppler::document::get_producer).
+  std::string* poppler_document_producer(poppler::document* doc) {
+    return to_stdstring(doc->get_producer());
+  }
+
+  // Subject of the document (poppler::document::get_subject).
+  std::string* poppler_document_subject(poppler::document* doc) {
+    return to_stdstring(doc->get_subject());
+  }
+
+  // Title of the document (poppler::document::get_title).
+  std::string* poppler_document_title(poppler::document* doc) {
+    return to_stdstring(doc->get_title());
+  }
+
+  // Keywords of the document (poppler::document::get_keywords).
+  std::string* poppler_document_keywords(poppler::document* doc) {
+    return to_stdstring(doc->get_keywords());
+  }
+
+  // Metadata of the document (poppler::document::metadata), returned as-is;
+  // any parsing of it happens on the Haskell side.
+  std::string* poppler_document_metadata(poppler::document* doc) {
+    return to_stdstring(doc->metadata());
+  }
+
   poppler::page* poppler_document_open_page(poppler::document* doc, int page) {
     return doc->create_page(page);
   }

          
@@ 32,19 65,15 @@ extern "C" {
     std::vector<char> vc;
     switch (layout) {
     case 0: { 
-      vc = page->text(poppler::rectf(), poppler::page::text_layout_enum::raw_order_layout).to_utf8();
-      break;
+      return to_stdstring(page->text(poppler::rectf(), poppler::page::text_layout_enum::raw_order_layout));
     }
     case 1: {
-      vc = page->text(poppler::rectf(), poppler::page::text_layout_enum::physical_layout).to_utf8();
-      break;
+      return to_stdstring(page->text(poppler::rectf(), poppler::page::text_layout_enum::physical_layout));
     }
     default: {
-      vc = page->text(poppler::rectf(), poppler::page::text_layout_enum::non_raw_non_physical_layout).to_utf8();
-      break;
+      return to_stdstring(page->text(poppler::rectf(), poppler::page::text_layout_enum::non_raw_non_physical_layout));
     }
     }
-    return new std::string(vc.begin(), vc.end());
   }
 
   void poppler_page_delete(poppler::page* page) {

          
M pdftotext.cabal +8 -1
@@ 1,7 1,7 @@ 
 cabal-version:       >=1.10
 
 name:                pdftotext
-version:             0.0.1.0
+version:             0.0.2.0
 synopsis:            Extracts text from PDF using poppler
 description:         The @pdftotext@ package provides functions for extraction of plain text from PDF documents. It uses C++ library [Poppler](https://poppler.freedesktop.org/), which is required to be installed in the system. Output of Haskell @pdftotext@ library is identical to output of Poppler's tool @pdftotext@.
 homepage:            https://sr.ht/~geyaeb/haskell-pdftotext/

          
@@ 19,6 19,10 @@ source-repository head
   type:                mercurial
   location:            https://hg.sr.ht/~geyaeb/haskell-pdftotext
 
+flag xml-conduit
+  description:         Parse metadata of PDF document properties using xml-conduit
+  default:             False
+  
 library
   exposed-modules:     Pdftotext
                      , Pdftotext.Foreign

          
@@ 34,6 38,9 @@ library
                      , cbits/stdstring.cc
   extra-libraries:     stdc++
   pkgconfig-depends:   poppler-cpp
+  if flag(xml-conduit)
+     build-depends:    xml-conduit == 1.8.*
+     cpp-options:      -DXMLC
 
 test-suite pdftotext-test
   default-language:    Haskell2010

          
M src/Pdftotext.hs +12 -0
@@ 20,6 20,10 @@ Portability : POSIX
 >   Just pdf <- openFile "path/to/file.pdf"
 >   T.putStrLn $ pdftotext Physical pdf
 
+=== Flags
+
+* @xml-conduit@ – 'metadata' of PDF document properties is parsed as XML; otherwise it remains as text
+
 -}
 {- ORMOLU_ENABLE -}
 module Pdftotext

          
@@ 27,6 31,7 @@ module Pdftotext
     Document,
     Layout (..),
     Page,
+    Properties (..),
 
     -- * Loading PDF's
     openByteString,

          
@@ 37,6 42,7 @@ module Pdftotext
     pages,
     pagesTotal,
     pdftotext,
+    properties,
 
     -- * Page functions
     pageNumber,

          
@@ 67,6 73,12 @@ pages = unsafePerformIO . pagesIO
 pagesTotal :: Document -> Int
 pagesTotal = unsafePerformIO . pagesTotalIO
 
+-- | Read the 'Properties' of a PDF document.
+--
+-- Pure counterpart of 'propertiesIO', run through 'unsafePerformIO'.
+--
+-- @since 0.0.2.0
+properties :: Document -> Properties
+properties doc = unsafePerformIO $ propertiesIO doc
+
 -- | Extract text from PDF document with given 'Layout'.
 pdftotext :: Layout -> Document -> Text
 pdftotext lay doc = unsafePerformIO $ pdftotextIO lay doc

          
M src/Pdftotext/Foreign.hs +29 -1
@@ 24,9 24,16 @@ module Pdftotext.Foreign
     -- * FFI
     ffiOpenPdf,
     ffiOpenData,
+    ffiDocumentAuthor,
+    ffiDocumentCreator,
     ffiDocumentDelete,
+    ffiDocumentKeywords,
+    ffiDocumentMetadata,
+    ffiDocumentOpenPage,
     ffiDocumentPages,
-    ffiDocumentOpenPage,
+    ffiDocumentProducer,
+    ffiDocumentSubject,
+    ffiDocumentTitle,
     ffiPageDelete,
     ffiPageText,
     ffiStringLength,

          
@@ 60,6 67,27 @@ foreign import ccall unsafe "&poppler_do
 foreign import ccall unsafe "poppler_document_pages"
   ffiDocumentPages :: Ptr Poppler_Document -> IO CInt
 
+-- | Binding to @poppler_document_author@: author of the document.
+foreign import ccall unsafe "poppler_document_author"
+  ffiDocumentAuthor :: Ptr Poppler_Document -> IO StdString
+
+-- | Binding to @poppler_document_creator@: creator of the document.
+foreign import ccall unsafe "poppler_document_creator"
+  ffiDocumentCreator :: Ptr Poppler_Document -> IO StdString
+
+-- | Binding to @poppler_document_producer@: producer of the document.
+foreign import ccall unsafe "poppler_document_producer"
+  ffiDocumentProducer :: Ptr Poppler_Document -> IO StdString
+
+-- | Binding to @poppler_document_subject@: subject of the document.
+foreign import ccall unsafe "poppler_document_subject"
+  ffiDocumentSubject :: Ptr Poppler_Document -> IO StdString
+
+-- | Binding to @poppler_document_title@: title of the document.
+foreign import ccall unsafe "poppler_document_title"
+  ffiDocumentTitle :: Ptr Poppler_Document -> IO StdString
+
+-- | Binding to @poppler_document_keywords@: keywords of the document.
+foreign import ccall unsafe "poppler_document_keywords"
+  ffiDocumentKeywords :: Ptr Poppler_Document -> IO StdString
+
+-- | Binding to @poppler_document_metadata@: metadata of the document,
+-- returned unparsed.
+foreign import ccall unsafe "poppler_document_metadata"
+  ffiDocumentMetadata :: Ptr Poppler_Document -> IO StdString
+
 foreign import ccall unsafe "poppler_document_open_page"
   ffiDocumentOpenPage :: Ptr Poppler_Document -> CInt -> IO (Ptr Poppler_Page)
 

          
M src/Pdftotext/Internal.hs +56 -0
@@ 1,4 1,5 @@ 
 {-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE CPP #-}
 
 {- ORMOLU_DISABLE -}
 {-|

          
@@ 18,6 19,7 @@ module Pdftotext.Internal
     Document (..),
     Layout (..),
     Page (..),
+    Properties (..),
 
     -- * Loading PDF's
     openByteStringIO,

          
@@ 28,6 30,7 @@ module Pdftotext.Internal
     pagesIO,
     pagesTotalIO,
     pdftotextIO,
+    propertiesIO,
 
     -- * Page functions
     pageTextIO,

          
@@ 41,8 44,33 @@ import Foreign (ForeignPtr, newForeignPt
 import Foreign.C (withCString)
 import Pdftotext.Foreign
 
+#ifdef XMLC
+import qualified Text.XML as X
+import qualified Data.Text.Lazy as TL
+#endif
+
 newtype Document = Document (ForeignPtr Poppler_Document)
 
+-- | Document properties.
+--
+-- As produced by 'propertiesIO', a field is 'Nothing' when the corresponding
+-- property of the document is empty.
+--
+-- If flag @xml-conduit@ is set, 'metadata' is of type @Maybe Text.XML.Document@,
+-- otherwise it is of type @Maybe Text@.
+--
+-- @since 0.0.2.0
+data Properties = Properties
+  { -- | Author of the document.
+    author :: Maybe T.Text,
+    -- | Creator of the document.
+    creator :: Maybe T.Text,
+    -- | Keywords of the document.
+    keywords :: Maybe T.Text,
+    -- | Metadata of the document; its type depends on flag @xml-conduit@.
+#ifdef XMLC
+    metadata :: Maybe X.Document,
+#else
+    metadata :: Maybe T.Text,
+#endif
+    -- | Producer of the document.
+    producer :: Maybe T.Text,
+    -- | Subject of the document.
+    subject :: Maybe T.Text,
+    -- | Title of the document.
+    title :: Maybe T.Text
+  }
+  deriving (Show)
+
 data Page = Page
   { -- | Number of this page in original document.
     pageNumber :: Int,

          
@@ 119,6 147,34 @@ pageTextIO layout (Page _ _ ptr) = withF
         Physical -> 1
         None -> 2
 
+-- | Extract properties from the document.
+--
+-- Every property is read as text; a property whose text is empty is
+-- reported as 'Nothing'. If flag @xml-conduit@ is set, the metadata text is
+-- additionally parsed as XML and is 'Nothing' when it cannot be parsed.
+--
+-- @since 0.0.2.0
+propertiesIO :: Document -> IO Properties
+propertiesIO (Document docptr) = withForeignPtr docptr \doc -> do
+  a <- asText $ ffiDocumentAuthor doc
+  c <- asText $ ffiDocumentCreator doc
+  k <- asText $ ffiDocumentKeywords doc
+  m <- asText $ ffiDocumentMetadata doc
+  p <- asText $ ffiDocumentProducer doc
+  s <- asText $ ffiDocumentSubject doc
+  t <- asText $ ffiDocumentTitle doc
+  -- Named fields keep each value tied to its property regardless of the
+  -- order in which the record declares them.
+  return $
+    Properties
+      { author = nonEmptyText a,
+        creator = nonEmptyText c,
+        keywords = nonEmptyText k,
+        metadata = parseMetadata m,
+        producer = nonEmptyText p,
+        subject = nonEmptyText s,
+        title = nonEmptyText t
+      }
+  where
+    -- Empty text means the property is not set.
+    nonEmptyText x
+      | T.null x = Nothing
+      | otherwise = Just x
+    -- Only this helper depends on the flag, so the record above is built once.
+#ifdef XMLC
+    parseMetadata x =
+      nonEmptyText x
+        >>= either (const Nothing) Just . X.parseText X.def . TL.fromStrict
+#else
+    parseMetadata = nonEmptyText
+#endif
+
 -- | Extract text from PDF document with given 'Layout'.
 pdftotextIO :: Layout -> Document -> IO T.Text
 pdftotextIO layout doc = do

          
M test/PdftotextSpec.hs +11 -0
@@ 1,5 1,6 @@ 
 {-# LANGUAGE BlockArguments #-}
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RecordWildCards #-}
 
 module PdftotextSpec (spec) where
 

          
@@ 30,3 31,13 @@ spec = do
         pagesTotal doc `shouldBe` 4
       it "should contain correct number of pages (`pages`)" \(Just doc) ->
         length (pages doc) `shouldBe` 4
+
+    -- The expected values are the ones test/simple.tex passes to hyperref.
+    describe "PDF properties" do
+      it "should contain all fields" \(Just doc) -> do
+        let props = properties doc
+        author props `shouldBe` Just "G. Eyaeb"
+        title props `shouldBe` Just "Simple document for testing"
+        creator props `shouldBe` Just "pdflatex"
+        producer props `shouldBe` Just "LaTeX with hyperref"
+        keywords props `shouldBe` Just "haskell,pdf"
+        subject props `shouldBe` Just "Testing"

          
M test/simple.pdf +0 -0

        
M test/simple.tex +10 -0
@@ 1,5 1,15 @@ 
 \documentclass{article}
 
+\usepackage[
+pdftex,
+pdfauthor={G. Eyaeb},
+pdfsubject={Testing},
+pdftitle={Simple document for testing},
+pdfkeywords={haskell,pdf},
+pdfproducer={LaTeX with hyperref},
+pdfcreator={pdflatex}
+]{hyperref}
+
 \title{Simple document for testing}
 \date{}
 \pagenumbering{gobble}