5 files changed, 211 insertions(+), 35 deletions(-)

M CHANGELOG.md
M README.md
A => cli/Cli.hs
M pdftotext.cabal
M src/Pdftotext/Internal.hs
M CHANGELOG.md +5 -0
@@ 1,5 1,10 @@ 
 # Revision history for pdftotext
 
+## 0.1.0.0 -- 
+
+* Added executable `pdftotext.hs`
+* Removed `xml-conduit` flag, it was bad idea, sorry for that
+
 ## 0.0.2.0 -- 2020-06-11
 
 * Added PDF document properties (author, title etc.)

          
M README.md +30 -3
@@ 14,11 14,38 @@ main = do
   T.putStrLn $ pdftotext Physical pdf
 ```
 
-## Flags
+## Executable
+
+`pdftotext` comes with executable program `pdftotext.hs` which can print text extracted from PDF and basic information from the document.
 
-### `xml-conduit`
+```shell
+$> pdftotext.hs info test/simple.pdf
+File      : test/simple.pdf
+Pages     : 4
+Properties
+  Title   : Simple document for testing
+  Author  : G. Eyaeb
+  Subject : Testing
+  Creator : pdflatex
+  Producer: LaTeX with hyperref
+  Keywords: haskell,pdf
+```
 
-`pdftotext` can extract properties from PDF document. One of them is metadata which is in form of XML document. If `xml-conduit` flag is set then the metadata is parsed using `xml-conduit` package, otherwise provided as text.
+```shell
+$> pdftotext.hs print --pages 1,4 test/simple.pdf
+Simple document for testing
+
+                  deserve neither
+liberty nor safety.
+```
+
+See help for more information:
+
+```shell
+$> pdftotext.hs --help
+$> pdftotext.hs print --help
+$> pdftotext.hs info --help
+```
 
 ## Internals
 

          
A => cli/Cli.hs +155 -0
@@ 0,0 1,155 @@ 
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RecordWildCards #-}
+
+module Main (main) where
+
+import Data.Aeson ((.=), ToJSON (..), defaultOptions, genericToJSON, object)
+import Data.Aeson.Text (encodeToLazyText)
+import Data.Bifunctor (first)
+import Data.List (sort)
+import Data.Maybe (catMaybes)
+import Data.Range (Range, fromRanges, intersection, lbi)
+import Data.Range.Parser (parseRanges)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy.IO as TL
+import Options.Applicative
+import Pdftotext.Internal
+import qualified Text.PrettyPrint.ANSI.Leijen as P
+
+data Command
+  = Print PrintOptions
+  | Info InfoOptions
+
+data PrintOptions = PrintOptions
+  { prtFile :: FilePath,
+    prtPages :: [Range Int],
+    prtOutfile :: Maybe FilePath,
+    prtSeparate :: Bool,
+    prtColor :: Bool,
+    prtViewer :: Bool
+  }
+  deriving (Show)
+
+data InfoFormat = JsonFormat | PlainFormat deriving (Eq, Show)
+
+data InfoOptions = InfoOptions
+  { infFile :: FilePath,
+    infFormat :: InfoFormat
+  }
+  deriving (Show)
+
+data Information = Information
+  { iProperties :: Properties,
+    iFile :: FilePath,
+    iPages :: Int
+  }
+  deriving (Show)
+
+instance ToJSON Information where
+  toJSON Information {..} =
+    object
+      [ "file" .= iFile,
+        "pages" .= iPages,
+        "properties" .= genericToJSON defaultOptions iProperties
+      ]
+
+main :: IO ()
+main =
+  execParser
+    ( info
+        (commandParser <**> helper)
+        (fullDesc <> progDesc "Extract text from PDF")
+    )
+    >>= \case
+      Print opts -> printDocument opts
+      Info opts -> printInfo opts
+
+commandParser :: Parser Command
+commandParser =
+  hsubparser
+    ( command "print" (info printOptions (progDesc "Print extracted text" <> footer "RANGE: -3,5,7-12,15,20-"))
+        <> command "info" (info infoOptions (progDesc "Show information about document"))
+    )
+
+infoOptions :: Parser Command
+infoOptions =
+  fmap Info $
+    InfoOptions
+      <$> strArgument (metavar "FILE" <> help "PDF file")
+      <*> option format (long "format" <> short 'f' <> help "Output format (plain, json)" <> value PlainFormat)
+  where
+    format =
+      eitherReader \case
+        "json" -> Right JsonFormat
+        "plain" -> Right PlainFormat
+        f -> Left $ f ++ " is not a valid output format, use one of: plain, json"
+
+printOptions :: Parser Command
+printOptions =
+  fmap Print $
+    PrintOptions
+      <$> strArgument (metavar "FILE" <> help "PDF file")
+      <*> option range (long "pages" <> short 'p' <> help "Range of pages to process" <> metavar "RANGE" <> value [])
+      <*> pure Nothing -- switch (metavar "FILE" <> long "output" <> short 'o' <> help "Write output to file")
+      <*> pure False -- switch (long "separate" <> help "Separate pages")
+      <*> pure False -- switch (long "color" <> short "c" <> help "Use colors")
+      <*> pure False -- switch (long "viewer" <> short "v" <> help "Use internal viewer")
+  where
+    range = eitherReader (first show . parseRanges)
+
+printDocument :: PrintOptions -> IO ()
+printDocument PrintOptions {..} = do
+  f <- openFile prtFile
+  case f of
+    Just d -> do
+      pageNo <- pagesTotalIO d
+      pages <- mapM (flip pageIO d) (pageList pageNo prtPages)
+      txt <- mapM (pageTextIO Physical) (catMaybes pages)
+      T.putStrLn (T.concat txt)
+    _ -> putStrLn $ prtFile ++ " is not a valid PDF document"
+
+pageList :: Int -> [Range Int] -> [Int]
+pageList total [] = [1 .. total]
+pageList total ranges =
+  sort
+    $ filter (<= total)
+    $ take total
+    $ fromRanges
+    $ intersection [lbi 1] ranges
+
+printInfo :: InfoOptions -> IO ()
+printInfo InfoOptions {..} = do
+  f <- openFile infFile
+  case f of
+    Just d -> do
+      p <- propertiesIO d
+      pageno <- pagesTotalIO d
+      let i = Information p infFile pageno
+      case infFormat of
+        JsonFormat -> printInfoJson i
+        PlainFormat -> printInfoPlain i
+    _ -> putStrLn $ infFile ++ " is not a valid PDF document"
+
+printInfoJson :: Information -> IO ()
+printInfoJson p = TL.putStrLn (encodeToLazyText $ toJSON p)
+
+{- ORMOLU_DISABLE -}
+printInfoPlain :: Information -> IO ()
+printInfoPlain Information{..} =
+  P.putDoc $
+    P.text "File      :" P.<+> P.text iFile P.<$>
+    P.text "Pages     :" P.<+> P.text (show iPages) P.<$>
+    P.text "Properties" P.<$>
+      P.indent 2 (
+          P.text "Title   :" P.<+> P.text (maybe "" T.unpack title)    P.<$>
+          P.text "Author  :" P.<+> P.text (maybe "" T.unpack author)   P.<$>
+          P.text "Subject :" P.<+> P.text (maybe "" T.unpack subject)  P.<$>
+          P.text "Creator :" P.<+> P.text (maybe "" T.unpack creator)  P.<$>
+          P.text "Producer:" P.<+> P.text (maybe "" T.unpack producer) P.<$>
+          P.text "Keywords:" P.<+> P.text (maybe "" T.unpack keywords)
+        ) P.<> P.hardline
+  where Properties{..} = iProperties
+{- ORMOLU_ENABLE -}

          
M pdftotext.cabal +15 -9
@@ 1,7 1,7 @@ 
 cabal-version:       >=1.10
 
 name:                pdftotext
-version:             0.0.2.0
+version:             0.1.0.0
 synopsis:            Extracts text from PDF using poppler
 description:         The @pdftotext@ package provides functions for extraction of plain text from PDF documents. It uses C++ library [Poppler](https://poppler.freedesktop.org/), which is required to be installed in the system. Output of Haskell @pdftotext@ library is identical to output of Poppler's tool @pdftotext@.
 homepage:            https://sr.ht/~geyaeb/haskell-pdftotext/

          
@@ 18,10 18,6 @@ extra-source-files:  CHANGELOG.md, READM
 source-repository head
   type:                mercurial
   location:            https://hg.sr.ht/~geyaeb/haskell-pdftotext
-
-flag xml-conduit
-  description:         Parse metadata of PDF document properties using xml-conduit
-  default:             False
   
 library
   exposed-modules:     Pdftotext

          
@@ 38,9 34,19 @@ library
                      , cbits/stdstring.cc
   extra-libraries:     stdc++
   pkgconfig-depends:   poppler-cpp
-  if flag(xml-conduit)
-     build-depends:    xml-conduit == 1.8.*
-     cpp-options:      -DXMLC
+
+executable pdftotext.hs
+  build-depends:       base >= 4.11 && < 5
+                     , aeson >= 1.4 && < 1.6
+                     , ansi-wl-pprint == 0.6.*
+                     , optparse-applicative == 0.15.*
+                     , pdftotext
+                     , range == 0.3.*
+                     , text == 1.2.*
+  hs-source-dirs:      cli
+  main-is:             Cli.hs
+  ghc-options:         -Wall -Wincomplete-uni-patterns -Wincomplete-record-updates -Wcompat -Widentities -Wredundant-constraints -fhide-source-paths -Wmissing-export-lists -Wpartial-fields
+  default-language:    Haskell2010
 
 test-suite pdftotext-test
   default-language:    Haskell2010

          
@@ 52,4 58,4 @@ test-suite pdftotext-test
   hs-source-dirs:      test
   main-is:             Spec.hs
   other-modules:       PdftotextSpec
-  build-tool-depends:  hspec-discover:hspec-discover == 2.*
  No newline at end of file
+  build-tool-depends:  hspec-discover:hspec-discover == 2.*

          
M src/Pdftotext/Internal.hs +6 -23
@@ 1,5 1,5 @@ 
 {-# LANGUAGE BlockArguments #-}
-{-# LANGUAGE CPP #-}
+{-# LANGUAGE DeriveGeneric #-}
 
 {- ORMOLU_DISABLE -}
 {-|

          
@@ 42,13 42,9 @@ import Data.ByteString.Internal
 import qualified Data.Text as T
 import Foreign (ForeignPtr, newForeignPtr, nullPtr, withForeignPtr)
 import Foreign.C (withCString)
+import GHC.Generics
 import Pdftotext.Foreign
 
-#ifdef XMLC
-import qualified Text.XML as X
-import qualified Data.Text.Lazy as TL
-#endif
-
 newtype Document = Document (ForeignPtr Poppler_Document)
 
 -- | Document properties.

          
@@ 60,16 56,12 @@ data Properties = Properties
   { author :: Maybe T.Text,
     creator :: Maybe T.Text,
     keywords :: Maybe T.Text,
-#ifdef XMLC
-    metadata :: Maybe X.Document,
-#else
     metadata :: Maybe T.Text,
-#endif
     producer :: Maybe T.Text,
     subject :: Maybe T.Text,
     title :: Maybe T.Text
   }
-  deriving (Show)
+  deriving (Show, Generic)
 
 data Page = Page
   { -- | Number of this page in original document.

          
@@ 148,6 140,7 @@ pageTextIO layout (Page _ _ ptr) = withF
         None -> 2
 
 -- | Extract properties from the document.
+--
 -- @since 0.0.2.0
 propertiesIO :: Document -> IO Properties
 propertiesIO (Document docptr) = withForeignPtr docptr \doc -> do

          
@@ 158,22 151,12 @@ propertiesIO (Document docptr) = withFor
   p <- asText $ ffiDocumentProducer doc
   s <- asText $ ffiDocumentSubject doc
   t <- asText $ ffiDocumentTitle doc
-
-#ifdef XMLC
-  return $ Properties (f a) (f c) (f k) (xml m) (f p) (f s) (f t)
-  where
-    xml x =
-      if T.null x
-      then Nothing
-      else either (const Nothing) Just $ X.parseText X.def (TL.fromStrict x)
-#else
   return $ Properties (f a) (f c) (f k) (f m) (f p) (f s) (f t)
   where
-#endif
     f x =
       if T.null x
-      then Nothing
-      else Just x
+        then Nothing
+        else Just x
 
 -- | Extract text from PDF document with given 'Layout'.
 pdftotextIO :: Layout -> Document -> IO T.Text