M CHANGELOG.md +5 -0
@@ 1,5 1,10 @@
# Revision history for pdftotext
+## 0.1.0.0 --
+
+* Added executable `pdftotext.hs`
+* Removed `xml-conduit` flag, it was bad idea, sorry for that
+
## 0.0.2.0 -- 2020-06-11
* Added PDF document properties (author, title etc.)
M README.md +30 -3
@@ 14,11 14,38 @@ main = do
T.putStrLn $ pdftotext Physical pdf
```
-## Flags
+## Executable
+
+`pdftotext` comes with executable program `pdftotext.hs` which can print text extracted from PDF and basic information from the document.
-### `xml-conduit`
+```shell
+$> pdftotext.hs info test/simple.pdf
+File : test/simple.pdf
+Pages : 4
+Properties
+ Title : Simple document for testing
+ Author : G. Eyaeb
+ Subject : Testing
+ Creator : pdflatex
+ Producer: LaTeX with hyperref
+ Keywords: haskell,pdf
+```
-`pdftotext` can extract properties from PDF document. One of them is metadata which is in form of XML document. If `xml-conduit` flag is set then the metadata is parsed using `xml-conduit` package, otherwise provided as text.
+```shell
+$> pdftotext.hs print --pages 1,4 test/simple.pdf
+Simple document for testing
+
+ deserve neither
+liberty nor safety.
+```
+
+See help for more information:
+
+```shell
+$> pdftotext.hs --help
+$> pdftotext.hs print --help
+$> pdftotext.hs info --help
+```
## Internals
A => cli/Cli.hs +155 -0
@@ 0,0 1,155 @@
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RecordWildCards #-}
+
+module Main (main) where
+
+import Data.Aeson ((.=), ToJSON (..), defaultOptions, genericToJSON, object)
+import Data.Aeson.Text (encodeToLazyText)
+import Data.Bifunctor (first)
+import Data.List (sort)
+import Data.Maybe (catMaybes)
+import Data.Range (Range, fromRanges, intersection, lbi)
+import Data.Range.Parser (parseRanges)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy.IO as TL
+import Options.Applicative
+import Pdftotext.Internal
+import qualified Text.PrettyPrint.ANSI.Leijen as P
+
+data Command
+ = Print PrintOptions
+ | Info InfoOptions
+
+data PrintOptions = PrintOptions
+ { prtFile :: FilePath,
+ prtPages :: [Range Int],
+ prtOutfile :: Maybe FilePath,
+ prtSeparate :: Bool,
+ prtColor :: Bool,
+ prtViewer :: Bool
+ }
+ deriving (Show)
+
+data InfoFormat = JsonFormat | PlainFormat deriving (Eq, Show)
+
+data InfoOptions = InfoOptions
+ { infFile :: FilePath,
+ infFormat :: InfoFormat
+ }
+ deriving (Show)
+
+data Information = Information
+ { iProperties :: Properties,
+ iFile :: FilePath,
+ iPages :: Int
+ }
+ deriving (Show)
+
+instance ToJSON Information where
+ toJSON Information {..} =
+ object
+ [ "file" .= iFile,
+ "pages" .= iPages,
+ "properties" .= genericToJSON defaultOptions iProperties
+ ]
+
+main :: IO ()
+main =
+ execParser
+ ( info
+ (commandParser <**> helper)
+ (fullDesc <> progDesc "Extract text from PDF")
+ )
+ >>= \case
+ Print opts -> printDocument opts
+ Info opts -> printInfo opts
+
+commandParser :: Parser Command
+commandParser =
+ hsubparser
+ ( command "print" (info printOptions (progDesc "Print extracted text" <> footer "RANGE: -3,5,7-12,15,20-"))
+ <> command "info" (info infoOptions (progDesc "Show information about document"))
+ )
+
+infoOptions :: Parser Command
+infoOptions =
+ fmap Info $
+ InfoOptions
+ <$> strArgument (metavar "FILE" <> help "PDF file")
+ <*> option format (long "format" <> short 'f' <> help "Output format (plain, json)" <> value PlainFormat)
+ where
+ format =
+ eitherReader \case
+ "json" -> Right JsonFormat
+ "plain" -> Right PlainFormat
+ f -> Left $ f ++ " is not a valid output format, use one of: plain, json"
+
+printOptions :: Parser Command
+printOptions =
+ fmap Print $
+ PrintOptions
+ <$> strArgument (metavar "FILE" <> help "PDF file")
+ <*> option range (long "pages" <> short 'p' <> help "Range of pages to process" <> metavar "RANGE" <> value [])
+ <*> pure Nothing -- switch (metavar "FILE" <> long "output" <> short 'o' <> help "Write output to file")
+ <*> pure False -- switch (long "separate" <> help "Separate pages")
+ <*> pure False -- switch (long "color" <> short "c" <> help "Use colors")
+ <*> pure False -- switch (long "viewer" <> short "v" <> help "Use internal viewer")
+ where
+ range = eitherReader (first show . parseRanges)
+
+printDocument :: PrintOptions -> IO ()
+printDocument PrintOptions {..} = do
+ f <- openFile prtFile
+ case f of
+ Just d -> do
+ pageNo <- pagesTotalIO d
+ pages <- mapM (flip pageIO d) (pageList pageNo prtPages)
+ txt <- mapM (pageTextIO Physical) (catMaybes pages)
+ T.putStrLn (T.concat txt)
+ _ -> putStrLn $ prtFile ++ " is not a valid PDF document"
+
+pageList :: Int -> [Range Int] -> [Int]
+pageList total [] = [1 .. total]
+pageList total ranges =
+ sort
+ $ filter (<= total)
+ $ take total
+ $ fromRanges
+ $ intersection [lbi 1] ranges
+
+printInfo :: InfoOptions -> IO ()
+printInfo InfoOptions {..} = do
+ f <- openFile infFile
+ case f of
+ Just d -> do
+ p <- propertiesIO d
+ pageno <- pagesTotalIO d
+ let i = Information p infFile pageno
+ case infFormat of
+ JsonFormat -> printInfoJson i
+ PlainFormat -> printInfoPlain i
+ _ -> putStrLn $ infFile ++ " is not a valid PDF document"
+
+printInfoJson :: Information -> IO ()
+printInfoJson p = TL.putStrLn (encodeToLazyText $ toJSON p)
+
+{- ORMOLU_DISABLE -}
+printInfoPlain :: Information -> IO ()
+printInfoPlain Information{..} =
+ P.putDoc $
+ P.text "File :" P.<+> P.text iFile P.<$>
+ P.text "Pages :" P.<+> P.text (show iPages) P.<$>
+ P.text "Properties" P.<$>
+ P.indent 2 (
+ P.text "Title :" P.<+> P.text (maybe "" T.unpack title) P.<$>
+ P.text "Author :" P.<+> P.text (maybe "" T.unpack author) P.<$>
+ P.text "Subject :" P.<+> P.text (maybe "" T.unpack subject) P.<$>
+ P.text "Creator :" P.<+> P.text (maybe "" T.unpack creator) P.<$>
+ P.text "Producer:" P.<+> P.text (maybe "" T.unpack producer) P.<$>
+ P.text "Keywords:" P.<+> P.text (maybe "" T.unpack keywords)
+ ) P.<> P.hardline
+ where Properties{..} = iProperties
+{- ORMOLU_ENABLE -}
M pdftotext.cabal +15 -9
@@ 1,7 1,7 @@
cabal-version: >=1.10
name: pdftotext
-version: 0.0.2.0
+version: 0.1.0.0
synopsis: Extracts text from PDF using poppler
description: The @pdftotext@ package provides functions for extraction of plain text from PDF documents. It uses C++ library [Poppler](https://poppler.freedesktop.org/), which is required to be installed in the system. Output of Haskell @pdftotext@ library is identical to output of Poppler's tool @pdftotext@.
homepage: https://sr.ht/~geyaeb/haskell-pdftotext/
@@ 18,10 18,6 @@ extra-source-files: CHANGELOG.md, READM
source-repository head
type: mercurial
location: https://hg.sr.ht/~geyaeb/haskell-pdftotext
-
-flag xml-conduit
- description: Parse metadata of PDF document properties using xml-conduit
- default: False
library
exposed-modules: Pdftotext
@@ 38,9 34,19 @@ library
, cbits/stdstring.cc
extra-libraries: stdc++
pkgconfig-depends: poppler-cpp
- if flag(xml-conduit)
- build-depends: xml-conduit == 1.8.*
- cpp-options: -DXMLC
+
+executable pdftotext.hs
+ build-depends: base >= 4.11 && < 5
+ , aeson >= 1.4 && < 1.6
+ , ansi-wl-pprint == 0.6.*
+ , optparse-applicative == 0.15.*
+ , pdftotext
+ , range == 0.3.*
+ , text == 1.2.*
+ hs-source-dirs: cli
+ main-is: Cli.hs
+ ghc-options: -Wall -Wincomplete-uni-patterns -Wincomplete-record-updates -Wcompat -Widentities -Wredundant-constraints -fhide-source-paths -Wmissing-export-lists -Wpartial-fields
+ default-language: Haskell2010
test-suite pdftotext-test
default-language: Haskell2010
@@ 52,4 58,4 @@ test-suite pdftotext-test
hs-source-dirs: test
main-is: Spec.hs
other-modules: PdftotextSpec
- build-tool-depends: hspec-discover:hspec-discover == 2.*
No newline at end of file
+ build-tool-depends: hspec-discover:hspec-discover == 2.*
M src/Pdftotext/Internal.hs +6 -23
@@ 1,5 1,5 @@
{-# LANGUAGE BlockArguments #-}
-{-# LANGUAGE CPP #-}
+{-# LANGUAGE DeriveGeneric #-}
{- ORMOLU_DISABLE -}
{-|
@@ 42,13 42,9 @@ import Data.ByteString.Internal
import qualified Data.Text as T
import Foreign (ForeignPtr, newForeignPtr, nullPtr, withForeignPtr)
import Foreign.C (withCString)
+import GHC.Generics
import Pdftotext.Foreign
-#ifdef XMLC
-import qualified Text.XML as X
-import qualified Data.Text.Lazy as TL
-#endif
-
newtype Document = Document (ForeignPtr Poppler_Document)
-- | Document properties.
@@ 60,16 56,12 @@ data Properties = Properties
{ author :: Maybe T.Text,
creator :: Maybe T.Text,
keywords :: Maybe T.Text,
-#ifdef XMLC
- metadata :: Maybe X.Document,
-#else
metadata :: Maybe T.Text,
-#endif
producer :: Maybe T.Text,
subject :: Maybe T.Text,
title :: Maybe T.Text
}
- deriving (Show)
+ deriving (Show, Generic)
data Page = Page
{ -- | Number of this page in original document.
@@ 148,6 140,7 @@ pageTextIO layout (Page _ _ ptr) = withF
None -> 2
-- | Extract properties from the document.
+--
-- @since 0.0.2.0
propertiesIO :: Document -> IO Properties
propertiesIO (Document docptr) = withForeignPtr docptr \doc -> do
@@ 158,22 151,12 @@ propertiesIO (Document docptr) = withFor
p <- asText $ ffiDocumentProducer doc
s <- asText $ ffiDocumentSubject doc
t <- asText $ ffiDocumentTitle doc
-
-#ifdef XMLC
- return $ Properties (f a) (f c) (f k) (xml m) (f p) (f s) (f t)
- where
- xml x =
- if T.null x
- then Nothing
- else either (const Nothing) Just $ X.parseText X.def (TL.fromStrict x)
-#else
return $ Properties (f a) (f c) (f k) (f m) (f p) (f s) (f t)
where
-#endif
f x =
if T.null x
- then Nothing
- else Just x
+ then Nothing
+ else Just x
-- | Extract text from PDF document with given 'Layout'.
pdftotextIO :: Layout -> Document -> IO T.Text