M .hgignore +1 -0
@@ 4,5 4,6 @@ syntax: glob
dist-newstyle
test/*.log
test/*.aux
+test/*.out
stack*.yaml.lock
test/auto
M CHANGELOG.md +5 -0
@@ 1,5 1,10 @@
# Revision history for pdftotext
+## 0.0.2.0 -- 2020-06-11
+
+* Added PDF document properties (author, title etc.)
+* Added flag `xml-conduit` (parse metadata using `xml-conduit`)
+
## 0.0.1.0 -- 2020-05-10
* First version.
M README.md +6 -0
@@ 14,6 14,12 @@ main = do
T.putStrLn $ pdftotext Physical pdf
```
+## Flags
+
+### `xml-conduit`
+
+`pdftotext` can extract properties from a PDF document. One of them is the metadata, which comes in the form of an XML document. If the `xml-conduit` flag is set, the metadata is parsed using the `xml-conduit` package; otherwise it is provided as text.
+
## Internals
The library uses poppler via FFI, therefore internally all functions are of type `IO`. However, their non-`IO` variants (using `unsafePerformIO`) _should be_ safe to use. Module `Pdftotext.Internal` exposes all `IO`-typed functions.
M cbits/poppler.cc +36 -7
@@ 5,6 5,11 @@
extern "C" {
+ // Convert a poppler::ustring into a newly allocated std::string holding its
+ // UTF-8 bytes. The result is created with `new`, so whoever receives the
+ // pointer is responsible for deleting it.
+ // The argument is taken by const reference so the ustring is not copied
+ // on every call; temporaries returned by poppler getters still bind to it.
+ std::string* to_stdstring(const poppler::ustring& ust) {
+   const std::vector<char> vc = ust.to_utf8();
+   return new std::string(vc.begin(), vc.end());
+ }
+
poppler::document* poppler_document_open_pdf(const char* file) {
poppler::document* doc = poppler::document::load_from_file(file);
return doc;
@@ 24,6 29,34 @@ extern "C" {
return doc->pages();
}
+ // Returns the document's author (poppler's get_author) as the
+ // heap-allocated UTF-8 std::string produced by to_stdstring.
+ std::string* poppler_document_author(poppler::document* doc) {
+   const poppler::ustring author = doc->get_author();
+   return to_stdstring(author);
+ }
+
+ // Returns the document's creator (poppler's get_creator) as the
+ // heap-allocated UTF-8 std::string produced by to_stdstring.
+ std::string* poppler_document_creator(poppler::document* doc) {
+   const poppler::ustring creator = doc->get_creator();
+   return to_stdstring(creator);
+ }
+
+ // Returns the document's producer (poppler's get_producer) as the
+ // heap-allocated UTF-8 std::string produced by to_stdstring.
+ std::string* poppler_document_producer(poppler::document* doc) {
+   const poppler::ustring producer = doc->get_producer();
+   return to_stdstring(producer);
+ }
+
+ // Returns the document's subject (poppler's get_subject) as the
+ // heap-allocated UTF-8 std::string produced by to_stdstring.
+ std::string* poppler_document_subject(poppler::document* doc) {
+   const poppler::ustring subject = doc->get_subject();
+   return to_stdstring(subject);
+ }
+
+ // Returns the document's title (poppler's get_title) as the
+ // heap-allocated UTF-8 std::string produced by to_stdstring.
+ std::string* poppler_document_title(poppler::document* doc) {
+   const poppler::ustring title = doc->get_title();
+   return to_stdstring(title);
+ }
+
+ // Returns the document's keywords (poppler's get_keywords) as the
+ // heap-allocated UTF-8 std::string produced by to_stdstring.
+ std::string* poppler_document_keywords(poppler::document* doc) {
+   const poppler::ustring keywords = doc->get_keywords();
+   return to_stdstring(keywords);
+ }
+
+ // Returns the document's metadata (poppler's metadata()) as the
+ // heap-allocated UTF-8 std::string produced by to_stdstring.
+ std::string* poppler_document_metadata(poppler::document* doc) {
+   const poppler::ustring metadata = doc->metadata();
+   return to_stdstring(metadata);
+ }
+
poppler::page* poppler_document_open_page(poppler::document* doc, int page) {
return doc->create_page(page);
}
@@ 32,19 65,15 @@ extern "C" {
std::vector<char> vc;
switch (layout) {
case 0: {
- vc = page->text(poppler::rectf(), poppler::page::text_layout_enum::raw_order_layout).to_utf8();
- break;
+ return to_stdstring(page->text(poppler::rectf(), poppler::page::text_layout_enum::raw_order_layout));
}
case 1: {
- vc = page->text(poppler::rectf(), poppler::page::text_layout_enum::physical_layout).to_utf8();
- break;
+ return to_stdstring(page->text(poppler::rectf(), poppler::page::text_layout_enum::physical_layout));
}
default: {
- vc = page->text(poppler::rectf(), poppler::page::text_layout_enum::non_raw_non_physical_layout).to_utf8();
- break;
+ return to_stdstring(page->text(poppler::rectf(), poppler::page::text_layout_enum::non_raw_non_physical_layout));
}
}
- return new std::string(vc.begin(), vc.end());
}
void poppler_page_delete(poppler::page* page) {
M pdftotext.cabal +8 -1
@@ 1,7 1,7 @@
cabal-version: >=1.10
name: pdftotext
-version: 0.0.1.0
+version: 0.0.2.0
synopsis: Extracts text from PDF using poppler
description: The @pdftotext@ package provides functions for extraction of plain text from PDF documents. It uses C++ library [Poppler](https://poppler.freedesktop.org/), which is required to be installed in the system. Output of Haskell @pdftotext@ library is identical to output of Poppler's tool @pdftotext@.
homepage: https://sr.ht/~geyaeb/haskell-pdftotext/
@@ 19,6 19,10 @@ source-repository head
type: mercurial
location: https://hg.sr.ht/~geyaeb/haskell-pdftotext
+flag xml-conduit
+ description: Parse metadata of PDF document properties using xml-conduit
+ default: False
+
library
exposed-modules: Pdftotext
, Pdftotext.Foreign
@@ 34,6 38,9 @@ library
, cbits/stdstring.cc
extra-libraries: stdc++
pkgconfig-depends: poppler-cpp
+ if flag(xml-conduit)
+ build-depends: xml-conduit == 1.8.*
+ cpp-options: -DXMLC
test-suite pdftotext-test
default-language: Haskell2010
M src/Pdftotext.hs +12 -0
@@ 20,6 20,10 @@ Portability : POSIX
> Just pdf <- openFile "path/to/file.pdf"
> T.putStrLn $ pdftotext Physical pdf
+=== Flags
+
+* @xml-conduit@ – the 'metadata' field of the PDF document properties is parsed as XML; otherwise it is left as text
+
-}
{- ORMOLU_ENABLE -}
module Pdftotext
@@ 27,6 31,7 @@ module Pdftotext
Document,
Layout (..),
Page,
+ Properties (..),
-- * Loading PDF's
openByteString,
@@ 37,6 42,7 @@ module Pdftotext
pages,
pagesTotal,
pdftotext,
+ properties,
-- * Page functions
pageNumber,
@@ 67,6 73,12 @@ pages = unsafePerformIO . pagesIO
pagesTotal :: Document -> Int
pagesTotal = unsafePerformIO . pagesTotalIO
+-- | Collect the 'Properties' of a PDF document.
+--
+-- Pure wrapper that runs 'propertiesIO' through 'unsafePerformIO'.
+--
+-- @since 0.0.2.0
+properties :: Document -> Properties
+properties doc = unsafePerformIO (propertiesIO doc)
+
-- | Extract text from PDF document with given 'Layout'.
pdftotext :: Layout -> Document -> Text
pdftotext lay doc = unsafePerformIO $ pdftotextIO lay doc
M src/Pdftotext/Foreign.hs +29 -1
@@ 24,9 24,16 @@ module Pdftotext.Foreign
-- * FFI
ffiOpenPdf,
ffiOpenData,
+ ffiDocumentAuthor,
+ ffiDocumentCreator,
ffiDocumentDelete,
+ ffiDocumentKeywords,
+ ffiDocumentMetadata,
+ ffiDocumentOpenPage,
ffiDocumentPages,
- ffiDocumentOpenPage,
+ ffiDocumentProducer,
+ ffiDocumentSubject,
+ ffiDocumentTitle,
ffiPageDelete,
ffiPageText,
ffiStringLength,
@@ 60,6 67,27 @@ foreign import ccall unsafe "&poppler_do
foreign import ccall unsafe "poppler_document_pages"
ffiDocumentPages :: Ptr Poppler_Document -> IO CInt
+foreign import ccall unsafe "poppler_document_author"
+ ffiDocumentAuthor :: Ptr Poppler_Document -> IO StdString
+
+foreign import ccall unsafe "poppler_document_creator"
+ ffiDocumentCreator :: Ptr Poppler_Document -> IO StdString
+
+foreign import ccall unsafe "poppler_document_producer"
+ ffiDocumentProducer :: Ptr Poppler_Document -> IO StdString
+
+foreign import ccall unsafe "poppler_document_subject"
+ ffiDocumentSubject :: Ptr Poppler_Document -> IO StdString
+
+foreign import ccall unsafe "poppler_document_title"
+ ffiDocumentTitle :: Ptr Poppler_Document -> IO StdString
+
+foreign import ccall unsafe "poppler_document_keywords"
+ ffiDocumentKeywords :: Ptr Poppler_Document -> IO StdString
+
+foreign import ccall unsafe "poppler_document_metadata"
+ ffiDocumentMetadata :: Ptr Poppler_Document -> IO StdString
+
foreign import ccall unsafe "poppler_document_open_page"
ffiDocumentOpenPage :: Ptr Poppler_Document -> CInt -> IO (Ptr Poppler_Page)
M src/Pdftotext/Internal.hs +56 -0
@@ 1,4 1,5 @@
{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE CPP #-}
{- ORMOLU_DISABLE -}
{-|
@@ 18,6 19,7 @@ module Pdftotext.Internal
Document (..),
Layout (..),
Page (..),
+ Properties (..),
-- * Loading PDF's
openByteStringIO,
@@ 28,6 30,7 @@ module Pdftotext.Internal
pagesIO,
pagesTotalIO,
pdftotextIO,
+ propertiesIO,
-- * Page functions
pageTextIO,
@@ 41,8 44,33 @@ import Foreign (ForeignPtr, newForeignPt
import Foreign.C (withCString)
import Pdftotext.Foreign
+#ifdef XMLC
+import qualified Text.XML as X
+import qualified Data.Text.Lazy as TL
+#endif
+
newtype Document = Document (ForeignPtr Poppler_Document)
+-- | Document properties.
+--
+-- As filled in by 'propertiesIO', a field is 'Nothing' when the document
+-- yields an empty string for it.
+--
+-- If the @xml-conduit@ flag is set, 'metadata' has type
+-- @Maybe Text.XML.Document@; otherwise it has type @Maybe Text@.
+--
+-- @since 0.0.2.0
+data Properties = Properties
+  { -- | Author of the document.
+    author :: Maybe T.Text,
+    -- | Creator of the document.
+    creator :: Maybe T.Text,
+    -- | Keywords of the document.
+    keywords :: Maybe T.Text,
+#ifdef XMLC
+    -- | Metadata of the document, parsed as an XML document.
+    metadata :: Maybe X.Document,
+#else
+    -- | Metadata of the document, as unparsed text.
+    metadata :: Maybe T.Text,
+#endif
+    -- | Producer of the document.
+    producer :: Maybe T.Text,
+    -- | Subject of the document.
+    subject :: Maybe T.Text,
+    -- | Title of the document.
+    title :: Maybe T.Text
+  }
+  deriving (Eq, Show)
+
data Page = Page
{ -- | Number of this page in original document.
pageNumber :: Int,
@@ 119,6 147,34 @@ pageTextIO layout (Page _ _ ptr) = withF
Physical -> 1
None -> 2
+-- | Extract properties from the document.
+--
+-- Each property is read as text through the FFI; an empty string becomes
+-- 'Nothing'. When the @xml-conduit@ flag is set, non-empty metadata is
+-- additionally parsed with @Text.XML.parseText@, and metadata that fails to
+-- parse also becomes 'Nothing'.
+--
+-- @since 0.0.2.0
+propertiesIO :: Document -> IO Properties
+propertiesIO (Document docptr) = withForeignPtr docptr \doc -> do
+  a <- asText $ ffiDocumentAuthor doc
+  c <- asText $ ffiDocumentCreator doc
+  k <- asText $ ffiDocumentKeywords doc
+  m <- asText $ ffiDocumentMetadata doc
+  p <- asText $ ffiDocumentProducer doc
+  s <- asText $ ffiDocumentSubject doc
+  t <- asText $ ffiDocumentTitle doc
+  -- Record syntax keeps each value tied to its field name; all fields but
+  -- 'metadata' share one type, so a positional mix-up would still compile.
+  return
+    Properties
+      { author = unlessEmpty a,
+        creator = unlessEmpty c,
+        keywords = unlessEmpty k,
+        metadata = toMetadata m,
+        producer = unlessEmpty p,
+        subject = unlessEmpty s,
+        title = unlessEmpty t
+      }
+  where
+    -- An empty string is treated as an absent property.
+    unlessEmpty x
+      | T.null x = Nothing
+      | otherwise = Just x
+#ifdef XMLC
+    -- Parse non-empty metadata as XML; a parse failure yields 'Nothing'.
+    toMetadata x =
+      unlessEmpty x
+        >>= either (const Nothing) Just . X.parseText X.def . TL.fromStrict
+#else
+    -- Without xml-conduit the metadata is kept as plain text.
+    toMetadata = unlessEmpty
+#endif
+
-- | Extract text from PDF document with given 'Layout'.
pdftotextIO :: Layout -> Document -> IO T.Text
pdftotextIO layout doc = do
M test/PdftotextSpec.hs +11 -0
@@ 1,5 1,6 @@
{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RecordWildCards #-}
module PdftotextSpec (spec) where
@@ 30,3 31,13 @@ spec = do
pagesTotal doc `shouldBe` 4
it "should contain correct number of pages (`pages`)" \(Just doc) ->
length (pages doc) `shouldBe` 4
+
+ describe "PDF properties" do
+ it "should contain all fields" \(Just doc) -> do
+ let Properties {..} = properties doc
+ author `shouldBe` Just "G. Eyaeb"
+ title `shouldBe` Just "Simple document for testing"
+ creator `shouldBe` Just "pdflatex"
+ producer `shouldBe` Just "LaTeX with hyperref"
+ keywords `shouldBe` Just "haskell,pdf"
+ subject `shouldBe` Just "Testing"
M test/simple.pdf +0 -0
M test/simple.tex +10 -0
@@ 1,5 1,15 @@
\documentclass{article}
+\usepackage[
+pdftex,
+pdfauthor={G. Eyaeb},
+pdfsubject={Testing},
+pdftitle={Simple document for testing},
+pdfkeywords={haskell,pdf},
+pdfproducer={LaTeX with hyperref},
+pdfcreator={pdflatex}
+]{hyperref}
+
\title{Simple document for testing}
\date{}
\pagenumbering{gobble}