Add first version
A => .hgignore +8 -0
@@ 0,0 1,8 @@ 
+syntax: glob
+
+.stack-work/
+dist-newstyle
+test/*.log
+test/*.aux
+stack*.yaml.lock
+test/auto

          
A => CHANGELOG.md +5 -0
@@ 0,0 1,5 @@ 
+# Revision history for pdftotext
+
+## 0.0.1.0 -- 2020-05-10
+
+* First version.

          
A => LICENSE +30 -0
@@ 0,0 1,30 @@ 
+Copyright (c) 2020, G. Eyaeb
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * Neither the name of G. Eyaeb nor the names of other
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

          
A => README.md +25 -0
@@ 0,0 1,25 @@ 
+# pdftotext
+
+The `pdftotext` package provides functions for extraction of plain text from PDF documents. It uses C++ library [Poppler](https://poppler.freedesktop.org/), which is required to be installed in the system. Output of Haskell `pdftotext` library is identical to output of Poppler's tool `pdftotext`.
+
+## Usage
+
+```haskell
+import qualified Data.Text.IO as T
+import Pdftotext
+
+main :: IO ()
+main = do
+  Just pdf <- openFile "path/to/file.pdf"
+  T.putStrLn $ pdftotext Physical pdf
+```
+
+## Internals
+
+The library uses poppler via FFI, therefore internally all functions are of type `IO`. However, their non-`IO` variants (using `unsafePerformIO`) _should be_ safe to use. Module `Pdftotext.Internal` exposes all `IO`-typed functions.
+
+## Contribute
+
+Project is hosted at https://sr.ht/~geyaeb/haskell-pdftotext/ . The homepage provides links to [Mercurial repository](https://hg.sr.ht/~geyaeb/haskell-pdftotext), [mailing list](https://lists.sr.ht/~geyaeb/haskell-pdftotext) and [ticket tracker](https://todo.sr.ht/~geyaeb/haskell-pdftotext).
+
+Patches, suggestions, questions and general discussions can be sent to the [mailing list](https://lists.sr.ht/~geyaeb/haskell-pdftotext). Detailed information about sending patches by email can be found at [https://man.sr.ht/hg.sr.ht/email.md](https://man.sr.ht/hg.sr.ht/email.md).

          
A => Setup.hs +2 -0
@@ 0,0 1,2 @@ 
+import Distribution.Simple
+-- | Entry point of the setup script; delegates to 'defaultMain'.
+main :: IO ()
+main = defaultMain

          
A => cbits/poppler.cc +56 -0
@@ 0,0 1,56 @@ 
+#include <poppler-document.h>
+#include <poppler-page.h>
+#include <iostream>
+#include <string.h>
+
+extern "C" {
+
+  // Load a PDF document from the file at path `file`.
+  // Returns the result of poppler::document::load_from_file unchanged; the
+  // Haskell callers treat a null pointer as failure and release a non-null
+  // result through poppler_document_delete.
+  poppler::document* poppler_document_open_pdf(const char* file) {
+    return poppler::document::load_from_file(file);
+  }
+
+  // Load a PDF document from the `n` bytes starting at `data`.
+  // The bytes are copied into a local vector first, and that vector (not the
+  // caller's memory) is what gets handed to load_from_data.
+  poppler::document* poppler_document_open_data(const char* data, size_t n) {
+    std::vector<char> buffer(data, data + n);
+    return poppler::document::load_from_data(&buffer);
+  }
+
+  // Destroy a document object with `delete`.
+  // The Haskell side registers this function as the finalizer of documents
+  // it has opened.
+  void poppler_document_delete(poppler::document* doc) {
+    delete doc;
+  }
+
+  // Return the number of pages reported by doc->pages().
+  // `doc` is dereferenced unconditionally, so it must not be null.
+  int poppler_document_pages(poppler::document* doc) {
+    return doc->pages();
+  }
+
+  // Create the page object for index `page` via doc->create_page.
+  // The Haskell callers pass zero-based indices (page number minus one) and
+  // release the result through poppler_page_delete.
+  poppler::page* poppler_document_open_page(poppler::document* doc, int page) {
+    return doc->create_page(page);
+  }
+
+  // Extract the text of `page` as UTF-8 bytes in a heap-allocated std::string.
+  // `layout` selects the poppler text layout: 0 = raw order, 1 = physical,
+  // any other value = non-raw non-physical.
+  // The result is allocated with `new`; the caller must release it (the
+  // Haskell side does so through string_delete).
+  std::string* poppler_page_text(poppler::page* page, int layout) {
+    poppler::page::text_layout_enum mode;
+    if (layout == 0) {
+      mode = poppler::page::text_layout_enum::raw_order_layout;
+    } else if (layout == 1) {
+      mode = poppler::page::text_layout_enum::physical_layout;
+    } else {
+      mode = poppler::page::text_layout_enum::non_raw_non_physical_layout;
+    }
+    // An empty rectf is passed as the region, exactly as before.
+    const std::vector<char> utf8 = page->text(poppler::rectf(), mode).to_utf8();
+    return new std::string(utf8.begin(), utf8.end());
+  }
+
+  // Destroy a page object with `delete`.
+  // The Haskell side registers this function as the finalizer of pages it
+  // has created.
+  void poppler_page_delete(poppler::page* page) {
+    delete page;
+  }
+
+  
+
+}

          
A => cbits/stdstring.cc +18 -0
@@ 0,0 1,18 @@ 
+#include <string.h>
+#include <iostream>
+
+extern "C" {
+
+  // Return the number of bytes stored in `*s`.
+  // `s` is dereferenced unconditionally, so it must not be null.
+  size_t string_get_length(const std::string *s) {
+    return s->size();
+  }
+
+  // Destroy a heap-allocated std::string with `delete`.
+  // Used by the Haskell side to release the string returned by
+  // poppler_page_text.
+  void string_delete(std::string *s) {
+    delete s;
+  }
+  
+  // Copy the bytes of `*s` into `out`, which must have room for at least
+  // s->length() bytes. No terminating NUL is written.
+  //
+  // The string is taken by pointer because the Haskell FFI declaration passes
+  // a pointer to the std::string. A by-value std::string parameter only lines
+  // up with that on ABIs that pass non-trivial class types by hidden
+  // reference, which is not something an extern "C" interface should rely on.
+  void string_copy(const std::string *s, char* out) {
+    s->copy(out, s->length());
+  }
+  
+}

          
A => pdftotext.cabal +48 -0
@@ 0,0 1,48 @@ 
+cabal-version:       >=1.10
+
+name:                pdftotext
+version:             0.0.1.0
+synopsis:            Extracts text from PDF using poppler
+description:         The @pdftotext@ package provides functions for extraction of plain text from PDF documents. It uses C++ library [Poppler](https://poppler.freedesktop.org/), which is required to be installed in the system. Output of Haskell @pdftotext@ library is identical to output of Poppler's tool @pdftotext@.
+homepage:            https://sr.ht/~geyaeb/haskell-pdftotext/
+bug-reports:         https://todo.sr.ht/~geyaeb/haskell-pdftotext
+license:             BSD3
+license-file:        LICENSE
+author:              G. Eyaeb
+maintainer:          geyaeb@protonmail.com
+copyright:           2020 G. Eyaeb
+category:            Text, PDF
+build-type:          Simple
+extra-source-files:  CHANGELOG.md, README.md, test/*.pdf, test/*.txt
+
+source-repository head
+  type:                mercurial
+  location:            https://hg.sr.ht/~geyaeb/haskell-pdftotext
+
+library
+  exposed-modules:     Pdftotext
+                     , Pdftotext.Foreign
+                     , Pdftotext.Internal
+  -- The sources use the BlockArguments extension, which needs GHC 8.6
+  -- (base 4.12) or newer.
+  build-depends:       base >= 4.12 && < 5
+                     , bytestring == 0.10.*
+                     , text == 1.2.*
+  hs-source-dirs:      src
+  ghc-options:         -Wall -Wincomplete-uni-patterns -Wincomplete-record-updates -Wcompat -Widentities -Wredundant-constraints -fhide-source-paths -Wmissing-export-lists -Wpartial-fields
+  default-language:    Haskell2010
+  cc-options:          -Wall -fPIC
+  c-sources:           cbits/poppler.cc
+                     , cbits/stdstring.cc
+  extra-libraries:     stdc++
+  pkgconfig-depends:   poppler-cpp
+
+test-suite pdftotext-test
+  default-language:    Haskell2010
+  -- The test sources use the BlockArguments extension, which needs GHC 8.6
+  -- (base 4.12) or newer.
+  build-depends:       pdftotext
+                     , base >= 4.12 && < 5
+                     , text == 1.2.*
+                     , hspec == 2.7.*
+  type:                exitcode-stdio-1.0
+  hs-source-dirs:      test
+  main-is:             Spec.hs
+  other-modules:       PdftotextSpec
+  build-tool-depends:  hspec-discover:hspec-discover == 2.*
  No newline at end of file

          
A => src/Pdftotext.hs +76 -0
@@ 0,0 1,76 @@ 
+{-# LANGUAGE BlockArguments #-}
+
+{- ORMOLU_DISABLE -}
+{-|
+Module      : Pdftotext
+Description : Extracts text from PDF using poppler
+Copyright   : (c) 2020 G. Eyaeb
+License     : BSD-3-Clause
+Maintainer  : geyaeb@protonmail.com
+Stability   : experimental
+Portability : POSIX
+
+=== Usage
+
+> import qualified Data.Text.IO as T
+> import Pdftotext
+>
+> main :: IO ()
+> main = do
+>   Just pdf <- openFile "path/to/file.pdf"
+>   T.putStrLn $ pdftotext Physical pdf
+
+-}
+{- ORMOLU_ENABLE -}
+module Pdftotext
+  ( -- * Types
+    Document,
+    Layout (..),
+    Page,
+
+    -- * Loading PDF's
+    openByteString,
+    openFile,
+
+    -- * Document functions
+    page,
+    pages,
+    pagesTotal,
+    pdftotext,
+
+    -- * Page functions
+    pageNumber,
+    pageOutOf,
+    pageText,
+  )
+where
+
+import Data.ByteString
+import Data.Text (Text)
+import GHC.IO (unsafePerformIO)
+import Pdftotext.Internal
+
+-- | Parse a PDF document held in a 'ByteString'.
+--
+-- Pure wrapper that runs 'openByteStringIO' through 'unsafePerformIO'.
+-- Yields 'Nothing' when the data cannot be loaded as a PDF.
+openByteString :: ByteString -> Maybe Document
+openByteString bytes = unsafePerformIO (openByteStringIO bytes)
+
+-- | Look up the page with number @no@ in a document.
+--
+-- Pure wrapper that runs 'pageIO' through 'unsafePerformIO'. Yields
+-- 'Nothing' when the document has no such page.
+page :: Int -> Document -> Maybe Page
+page no = unsafePerformIO . pageIO no
+
+-- | List every page of a document.
+--
+-- Pure wrapper that runs 'pagesIO' through 'unsafePerformIO'.
+pages :: Document -> [Page]
+pages doc = unsafePerformIO (pagesIO doc)
+
+-- | Count the pages of a document.
+--
+-- Pure wrapper that runs 'pagesTotalIO' through 'unsafePerformIO'.
+pagesTotal :: Document -> Int
+pagesTotal doc = unsafePerformIO (pagesTotalIO doc)
+
+-- | Extract the text of a whole document using the given 'Layout'.
+--
+-- Pure wrapper that runs 'pdftotextIO' through 'unsafePerformIO'.
+pdftotext :: Layout -> Document -> Text
+pdftotext lay = unsafePerformIO . pdftotextIO lay
+
+-- | Extract the text of a single page using the given 'Layout'.
+--
+-- Pure wrapper that runs 'pageTextIO' through 'unsafePerformIO'.
+pageText :: Layout -> Page -> Text
+pageText l = unsafePerformIO . pageTextIO l

          
A => src/Pdftotext/Foreign.hs +92 -0
@@ 0,0 1,92 @@ 
+{-# LANGUAGE BlockArguments #-}
+
+{- ORMOLU_DISABLE -}
+{-|
+Module      : Pdftotext.Foreign
+Description : Foreign interface
+Copyright   : (c) 2020 G. Eyaeb
+License     : BSD-3-Clause
+Maintainer  : geyaeb@protonmail.com
+Stability   : experimental
+Portability : POSIX
+-}
+{- ORMOLU_ENABLE -}
+module Pdftotext.Foreign
+  ( -- * C++ objects
+    Poppler_Document,
+    Poppler_Page,
+
+    -- * 'std::string' helper
+    StdString,
+    asText,
+    stringToText,
+
+    -- * FFI
+    ffiOpenPdf,
+    ffiOpenData,
+    ffiDocumentDelete,
+    ffiDocumentPages,
+    ffiDocumentOpenPage,
+    ffiPageDelete,
+    ffiPageText,
+    ffiStringLength,
+    ffiStringDelete,
+    ffiStringCopy,
+  )
+where
+
+import qualified Data.Text as T
+import qualified Data.Text.Foreign as T
+import Foreign
+import Foreign.C
+
+-- | Constructor-less tag type for pointers to a C++ @poppler::document@.
+data Poppler_Document
+
+-- | Constructor-less tag type for pointers to a C++ @poppler::page@.
+data Poppler_Page
+
+-- | Constructor-less tag type standing in for a C++ @std::string@.
+data CStdString
+
+-- | Pointer to a C++ @std::string@ object.
+type StdString = Ptr CStdString
+
+-- | Binding to @poppler_document_open_pdf@: opens the PDF at the given path.
+-- Callers in "Pdftotext.Internal" treat a null result as failure.
+foreign import ccall unsafe "poppler_document_open_pdf"
+  ffiOpenPdf :: CString -> IO (Ptr Poppler_Document)
+
+-- | Binding to @poppler_document_open_data@: opens a PDF from a buffer of the
+-- given length in bytes. Callers in "Pdftotext.Internal" treat a null result
+-- as failure.
+--
+-- The length is a 'CSize' because the C++ function declares it as @size_t@;
+-- a narrower Haskell type would not match the callee's parameter width.
+foreign import ccall unsafe "poppler_document_open_data"
+  ffiOpenData :: Ptr Word8 -> CSize -> IO (Ptr Poppler_Document)
+
+-- | Address of @poppler_document_delete@, suitable as a 'ForeignPtr'
+-- finalizer for document pointers.
+foreign import ccall unsafe "&poppler_document_delete"
+  ffiDocumentDelete :: FunPtr (Ptr Poppler_Document -> IO ())
+
+-- | Binding to @poppler_document_pages@: number of pages in the document.
+foreign import ccall unsafe "poppler_document_pages"
+  ffiDocumentPages :: Ptr Poppler_Document -> IO CInt
+
+-- | Binding to @poppler_document_open_page@: creates the page object for the
+-- given index. Callers in "Pdftotext.Internal" pass zero-based indices.
+foreign import ccall unsafe "poppler_document_open_page"
+  ffiDocumentOpenPage :: Ptr Poppler_Document -> CInt -> IO (Ptr Poppler_Page)
+
+-- | Address of @poppler_page_delete@, suitable as a 'ForeignPtr' finalizer
+-- for page pointers.
+foreign import ccall unsafe "&poppler_page_delete"
+  ffiPageDelete :: FunPtr (Ptr Poppler_Page -> IO ())
+
+-- | Binding to @poppler_page_text@: extracts the text of a page into a newly
+-- allocated @std::string@.
+--
+-- The second argument is the layout code understood by the C++ side:
+-- @0@ = raw order, @1@ = physical, anything else = neither. It is a 'CInt'
+-- because the C++ function declares it as @int@ and receives values other
+-- than 0 and 1.
+foreign import ccall unsafe "poppler_page_text"
+  ffiPageText :: Ptr Poppler_Page -> CInt -> IO StdString
+
+-- | Binding to @string_get_length@: number of bytes in the @std::string@.
+--
+-- The result is a 'CSize' because the C++ function returns @size_t@.
+foreign import ccall unsafe "string_get_length"
+  ffiStringLength :: StdString -> IO CSize
+
+-- | Binding to @string_delete@: destroys the @std::string@. The pointer must
+-- not be used afterwards.
+foreign import ccall unsafe "string_delete"
+  ffiStringDelete :: StdString -> IO ()
+
+-- | Binding to @string_copy@: copies the bytes of the @std::string@ into the
+-- given buffer, which must hold at least 'ffiStringLength' bytes.
+foreign import ccall unsafe "string_copy"
+  ffiStringCopy :: StdString -> Ptr CChar -> IO ()
+
+-- | Run an action that produces a 'StdString' and convert its result with
+-- 'stringToText'.
+asText :: IO StdString -> IO T.Text
+asText action = action >>= stringToText
+
+-- | Convert a C++ @std::string@ into 'T.Text'.
+--
+-- The string's bytes are copied into a temporary buffer and decoded with
+-- 'T.peekCStringLen'. The @std::string@ is deleted afterwards, so the
+-- pointer must not be used again.
+stringToText :: StdString -> IO T.Text
+stringToText ptr = do
+  len <- fromIntegral <$> ffiStringLength ptr
+  allocaBytes len \out -> do
+    ffiStringCopy ptr out
+    txt <- T.peekCStringLen (out, len)
+    -- Release the C++ string only after its contents have been decoded.
+    ffiStringDelete ptr
+    return txt

          
A => src/Pdftotext/Internal.hs +127 -0
@@ 0,0 1,127 @@ 
+{-# LANGUAGE BlockArguments #-}
+
+{- ORMOLU_DISABLE -}
+{-|
+Module      : Pdftotext.Internal
+Description : Internal functions
+Copyright   : (c) 2020 G. Eyaeb
+License     : BSD-3-Clause
+Maintainer  : geyaeb@protonmail.com
+Stability   : experimental
+Portability : POSIX
+
+Internal functions.
+-}
+{- ORMOLU_ENABLE -}
+module Pdftotext.Internal
+  ( -- * Types
+    Document (..),
+    Layout (..),
+    Page (..),
+
+    -- * Loading PDF's
+    openByteStringIO,
+    openFile,
+
+    -- * Document functions
+    pageIO,
+    pagesIO,
+    pagesTotalIO,
+    pdftotextIO,
+
+    -- * Page functions
+    pageTextIO,
+  )
+where
+
+import Control.Monad (forM)
+import Data.ByteString.Internal
+import qualified Data.Text as T
+import Foreign (ForeignPtr, newForeignPtr, nullPtr, plusPtr, touchForeignPtr, withForeignPtr)
+import Foreign.C (withCString)
+import Pdftotext.Foreign
+
+-- | Handle to an opened PDF document: a 'ForeignPtr' to the underlying
+-- poppler document. 'openFile' and 'openByteStringIO' attach
+-- 'ffiDocumentDelete' as its finalizer.
+newtype Document = Document (ForeignPtr Poppler_Document)
+
+-- | A single page of a 'Document', together with its position in it.
+data Page = Page
+  { -- | Number of this page in original document (the first page is 1).
+    pageNumber :: Int,
+    -- | Total number of pages in original document.
+    pageOutOf :: Int,
+    -- | Pointer to the underlying poppler page object.
+    pagePtr :: ForeignPtr Poppler_Page
+  }
+
+-- | Renders as @Page n/total@; the foreign pointer is not shown.
+instance Show Page where
+  show pg = concat ["Page ", show (pageNumber pg), "/", show (pageOutOf pg)]
+
+-- | Layout of text extracted from PDF.
+--
+-- Passed to 'pageTextIO' and 'pdftotextIO' to choose how the text is laid out.
+data Layout
+  = -- | Text emulates layout of PDF, including horizontal spaces,
+    -- and preserves hyphenation; corresponds to calling @pdftotext -layout@
+    Physical
+  | -- | Discards horizontal spaces, preserves hyphenation;
+    -- corresponds to calling @pdftotext -raw@
+    Raw
+  | -- | Discards horizontal spaces, removes hyphenation;
+    -- corresponds to calling @pdftotext@ without layout argument
+    None
+  deriving (Eq, Show)
+
+-- | Open the PDF stored at the given path.
+--
+-- Yields 'Nothing' when the underlying loader returns a null pointer, i.e.
+-- the file does not exist or cannot be parsed as valid PDF. On success the
+-- document is wrapped in a 'ForeignPtr' with 'ffiDocumentDelete' as finalizer.
+openFile :: FilePath -> IO (Maybe Document)
+openFile file = withCString file (\cfile -> ffiOpenPdf cfile >>= wrap)
+  where
+    wrap raw
+      | raw == nullPtr = return Nothing
+      | otherwise = Just . Document <$> newForeignPtr ffiDocumentDelete raw
+
+-- | Open PDF represented as bytestring. If document cannot be parsed as valid PDF,
+-- 'Nothing' is returned.
+--
+-- A 'ByteString' is a slice of a buffer: the 'PS' constructor carries the
+-- buffer pointer, the offset of the first byte and the length. The data
+-- handed to poppler therefore starts @off@ bytes past the base pointer;
+-- ignoring the offset would read the wrong bytes for any sliced bytestring.
+openByteStringIO :: ByteString -> IO (Maybe Document)
+openByteStringIO (PS ptr off len) =
+  withForeignPtr ptr \base -> do
+    docptr <- ffiOpenData (base `plusPtr` off) (fromIntegral len)
+    if docptr == nullPtr
+      then return Nothing
+      else Just . Document <$> newForeignPtr ffiDocumentDelete docptr
+
+-- | List every page of a document, in order.
+--
+-- Each page is opened by its zero-based index and wrapped in a 'ForeignPtr'
+-- with 'ffiPageDelete' as finalizer. The resulting 'Page' records carry
+-- one-based page numbers.
+pagesIO :: Document -> IO [Page]
+pagesIO (Document doc) =
+  withForeignPtr doc \docptr -> do
+    total <- ffiDocumentPages docptr
+    let open ix = do
+          raw <- ffiDocumentOpenPage docptr ix
+          fptr <- newForeignPtr ffiPageDelete raw
+          return (Page (fromIntegral ix + 1) (fromIntegral total) fptr)
+    forM [0 .. total - 1] open
+
+-- | Return page number @no@ (counting from 1) from PDF document, if the page
+-- exists.
+--
+-- 'Nothing' is returned when @no@ is outside @1 .. 'pagesTotalIO'@, and also
+-- when the underlying call hands back a null page pointer. The latter mirrors
+-- the null check done when opening documents and avoids wrapping a pointer
+-- that later calls would dereference.
+pageIO :: Int -> Document -> IO (Maybe Page)
+pageIO no d@(Document docptr) = withForeignPtr docptr \ptr -> do
+  pno <- pagesTotalIO d
+  if no > 0 && no <= pno
+    then do
+      -- The C++ side indexes pages from zero.
+      raw <- ffiDocumentOpenPage ptr (fromIntegral no - 1)
+      if raw == nullPtr
+        then return Nothing
+        else Just . Page no pno <$> newForeignPtr ffiPageDelete raw
+    else return Nothing
+
+-- | Count the pages of a document, as reported by 'ffiDocumentPages'.
+pagesTotalIO :: Document -> IO Int
+pagesTotalIO (Document doc) = do
+  n <- withForeignPtr doc ffiDocumentPages
+  return (fromIntegral n)
+
+-- | Extract the text of a single page using the given 'Layout'.
+--
+-- The layout is translated to the numeric code passed to 'ffiPageText'
+-- ('Raw' = 0, 'Physical' = 1, 'None' = 2), and the resulting C++ string is
+-- converted with 'stringToText'.
+pageTextIO :: Layout -> Page -> IO T.Text
+pageTextIO layout pg =
+  withForeignPtr (pagePtr pg) \p -> ffiPageText p (code layout) >>= stringToText
+  where
+    code Raw = 0
+    code Physical = 1
+    code None = 2
+
+-- | Extract text from PDF document with given 'Layout'.
+--
+-- The text of all pages, in order, is concatenated without any separator.
+--
+-- NOTE(review): the pages are created from the document and presumably keep
+-- referring to it on the C++ side, while a 'Page' value holds no reference
+-- to the 'Document'. The document's 'ForeignPtr' is therefore touched after
+-- the last page has been read, so its finalizer cannot run while text is
+-- still being extracted.
+pdftotextIO :: Layout -> Document -> IO T.Text
+pdftotextIO layout doc@(Document fptr) = do
+  ps <- pagesIO doc
+  txt <- mapM (pageTextIO layout) ps
+  touchForeignPtr fptr
+  return $ T.concat txt

          
A => stack.yaml +3 -0
@@ 0,0 1,3 @@ 
+resolver: lts-15.11
+packages:
+  - .

          
A => test/PdftotextSpec.hs +32 -0
@@ 0,0 1,32 @@ 
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module PdftotextSpec (spec) where
+
+import qualified Data.Text.IO as T
+import Pdftotext
+import Test.Hspec
+
+-- | Compares the text extracted from @test/simple.pdf@ with the expected
+-- output files under @test/@ and checks the page count.
+spec :: Spec
+spec = do
+  before openSimple do
+    describe "pdftotext with layout `None`" do
+      it "produces same output as `pdftotext simple.pdf`" \doc -> do
+        expected <- T.readFile "test/simple_none.txt"
+        pdftotext None doc `shouldBe` expected
+
+    describe "pdftotext with layout `Raw`" do
+      it "produces same output as `pdftotext -raw simple.pdf`" \doc -> do
+        expected <- T.readFile "test/simple_raw.txt"
+        pdftotext Raw doc `shouldBe` expected
+
+    describe "pdftotext with layout `Physical`" do
+      it "produces same output as `pdftotext -layout simple.pdf`" \doc -> do
+        expected <- T.readFile "test/simple_physical.txt"
+        pdftotext Physical doc `shouldBe` expected
+
+    describe "PDF" do
+      it "should have expected number of pages (`pagesTotal`)" \doc ->
+        pagesTotal doc `shouldBe` 4
+      it "should contain correct number of pages (`pages`)" \doc ->
+        length (pages doc) `shouldBe` 4
+  where
+    -- Open the fixture once per example. If it cannot be opened, fail with a
+    -- readable message instead of a pattern-match error inside each example.
+    openSimple =
+      openFile "test/simple.pdf"
+        >>= maybe (fail "cannot open test/simple.pdf") return

          
A => test/Spec.hs +1 -0
@@ 0,0 1,1 @@ 
+{-# OPTIONS_GHC -F -pgmF hspec-discover #-}

          
A => test/simple.pdf +0 -0

        
A => test/simple.tex +22 -0
@@ 0,0 1,22 @@ 
+\documentclass{article}
+
+\title{Simple document for testing}
+\date{}
+\pagenumbering{gobble}
+
+\begin{document}
+
+\maketitle
+\newpage
+
+They who can give up
+\newpage
+
+\parbox{1cm}{
+essential liberty to obtain a little temporary safety
+}
+\newpage
+
+\hspace{2cm} deserve neither\\
+liberty nor safety.
+\end{document}
  No newline at end of file

          
A => test/simple_none.txt +16 -0
@@ 0,0 1,16 @@ 
+Simple document for testing
+
+They who can give up
+
+essential
+liberty
+to
+obtain
+a little
+temporary
+safety
+
+deserve neither
+liberty nor safety.
+
+
  No newline at end of file

          
A => test/simple_physical.txt +15 -0
@@ 0,0 1,15 @@ 
+Simple document for testing
+They who can give up
+essential
+lib-
+erty
+to
+obtain
+a little
+tem-
+po-
+rary
+safety
+                  deserve neither
+liberty nor safety.
+
  No newline at end of file

          
A => test/simple_raw.txt +15 -0
@@ 0,0 1,15 @@ 
+Simple document for testing
+They who can give up
+essential
+lib-
+erty
+to
+obtain
+a little
+tem-
+po-
+rary
+safety
+deserve neither
+liberty nor safety.
+
  No newline at end of file