A => .hgignore +8 -0
@@ 0,0 1,8 @@
+syntax: glob
+
+.stack-work/
+dist-newstyle
+test/*.log
+test/*.aux
+stack*.yaml.lock
+test/auto
A => CHANGELOG.md +5 -0
@@ 0,0 1,5 @@
+# Revision history for pdftotext
+
+## 0.0.1.0 -- 2020-05-10
+
+* First version.
A => LICENSE +30 -0
@@ 0,0 1,30 @@
+Copyright (c) 2020, G. Eyaeb
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of G. Eyaeb nor the names of other
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
A => README.md +25 -0
@@ 0,0 1,25 @@
+# pdftotext
+
+The `pdftotext` package provides functions for extracting plain text from PDF documents. It uses the C++ library [Poppler](https://poppler.freedesktop.org/), which must be installed on the system. The output of the Haskell `pdftotext` library is identical to the output of Poppler's `pdftotext` tool.
+
+## Usage
+
+```haskell
+import qualified Data.Text.IO as T
+import Pdftotext
+
+main :: IO ()
+main = do
+ Just pdf <- openFile "path/to/file.pdf"
+ T.putStrLn $ pdftotext Physical pdf
+```
+
+## Internals
+
+The library uses poppler via FFI, therefore internally all functions are of type `IO`. However, their non-`IO` variants (using `unsafePerformIO`) _should be_ safe to use. Module `Pdftotext.Internal` exposes all `IO`-typed functions.
+
+## Contribute
+
+Project is hosted at https://sr.ht/~geyaeb/haskell-pdftotext/ . The homepage provides links to [Mercurial repository](https://hg.sr.ht/~geyaeb/haskell-pdftotext), [mailing list](https://lists.sr.ht/~geyaeb/haskell-pdftotext) and [ticket tracker](https://todo.sr.ht/~geyaeb/haskell-pdftotext).
+
+Patches, suggestions, questions and general discussions can be sent to the [mailing list](https://lists.sr.ht/~geyaeb/haskell-pdftotext). Detailed information about sending patches by email can be found at [https://man.sr.ht/hg.sr.ht/email.md](https://man.sr.ht/hg.sr.ht/email.md).
A => Setup.hs +2 -0
@@ 0,0 1,2 @@
+import Distribution.Simple
+-- Build entry point: delegates entirely to Cabal's default driver.
+main :: IO ()
+main = defaultMain
A => cbits/poppler.cc +56 -0
@@ 0,0 1,56 @@
+#include <poppler-document.h>
+#include <poppler-page.h>
+#include <iostream>
+#include <string.h>
+
+extern "C" {
+
+  // Load a PDF document from the file at path `file`.
+  // Returns the pointer produced by poppler::document::load_from_file; the
+  // caller owns it and releases it with poppler_document_delete.
+  poppler::document* poppler_document_open_pdf(const char* file) {
+    return poppler::document::load_from_file(file);
+  }
+
+  // Load a PDF document from the `n` bytes starting at `data`.
+  // The bytes are first copied into a local buffer, so the caller's memory
+  // is not referenced after this function returns.
+  poppler::document* poppler_document_open_data(const char* data, size_t n) {
+    std::vector<char> bytes(data, data + n);
+    return poppler::document::load_from_data(&bytes);
+  }
+
+  // Destroy a document obtained from one of the open functions above.
+  void poppler_document_delete(poppler::document* doc) {
+    delete doc;
+  }
+
+  // Number of pages in `doc`.
+  int poppler_document_pages(poppler::document* doc) {
+    return doc->pages();
+  }
+
+  // Create the page object for the zero-based index `page`.
+  // The result is passed through from create_page unchanged (it may be
+  // null); release it with poppler_page_delete.
+  poppler::page* poppler_document_open_page(poppler::document* doc, int page) {
+    return doc->create_page(page);
+  }
+
+  // Extract the text of `page` as UTF-8 bytes in a heap-allocated
+  // std::string that the caller must delete.
+  //
+  // `layout` selects the poppler text layout:
+  //   0 -> raw_order_layout
+  //   1 -> physical_layout
+  //   anything else -> non_raw_non_physical_layout
+  std::string* poppler_page_text(poppler::page* page, int layout) {
+    // The Haskell side wraps the create_page result without checking it,
+    // so a null page can reach this point; answer with empty text instead
+    // of dereferencing it.
+    if (page == nullptr) {
+      return new std::string();
+    }
+
+    poppler::page::text_layout_enum mode;
+    switch (layout) {
+      case 0:
+        mode = poppler::page::text_layout_enum::raw_order_layout;
+        break;
+      case 1:
+        mode = poppler::page::text_layout_enum::physical_layout;
+        break;
+      default:
+        mode = poppler::page::text_layout_enum::non_raw_non_physical_layout;
+        break;
+    }
+
+    // An empty rectf is what the original code passed for every layout.
+    const std::vector<char> utf8 = page->text(poppler::rectf(), mode).to_utf8();
+    return new std::string(utf8.begin(), utf8.end());
+  }
+
+  // Destroy a page obtained from poppler_document_open_page.
+  // Deleting a null pointer is a no-op.
+  void poppler_page_delete(poppler::page* page) {
+    delete page;
+  }
+
+}
A => cbits/stdstring.cc +18 -0
@@ 0,0 1,18 @@
+#include <string.h>
+#include <iostream>
+
+extern "C" {
+
+  // Number of bytes stored in the std::string `s`.
+  size_t string_get_length(const std::string *s) {
+    return s->length();
+  }
+
+  // Destroy a heap-allocated std::string.
+  void string_delete(std::string *s) {
+    delete s;
+  }
+
+  // Copy all bytes of `s` into `out`; no terminating NUL is written.
+  // `out` must have room for at least string_get_length(s) bytes.
+  //
+  // Takes a pointer like the other helpers: the FFI caller only ever holds
+  // a `std::string*`, and a by-value std::string parameter is not something
+  // a C caller can construct.
+  void string_copy(const std::string *s, char* out) {
+    s->copy(out, s->length());
+  }
+
+}
A => pdftotext.cabal +48 -0
@@ 0,0 1,48 @@
+cabal-version: >=1.10
+
+name: pdftotext
+version: 0.0.1.0
+synopsis: Extracts text from PDF using poppler
+description: The @pdftotext@ package provides functions for extraction of plain text from PDF documents. It uses C++ library [Poppler](https://poppler.freedesktop.org/), which is required to be installed in the system. Output of Haskell @pdftotext@ library is identical to output of Poppler's tool @pdftotext@.
+homepage: https://sr.ht/~geyaeb/haskell-pdftotext/
+bug-reports: https://todo.sr.ht/~geyaeb/haskell-pdftotext
+license: BSD3
+license-file: LICENSE
+author: G. Eyaeb
+maintainer: geyaeb@protonmail.com
+copyright: 2020 G. Eyaeb
+category: Text, PDF
+build-type: Simple
+extra-source-files: CHANGELOG.md, README.md, test/*.pdf, test/*.txt
+
+source-repository head
+ type: mercurial
+ location: https://hg.sr.ht/~geyaeb/haskell-pdftotext
+
+library
+ exposed-modules: Pdftotext
+ , Pdftotext.Foreign
+ , Pdftotext.Internal
+ build-depends: base >= 4.11 && < 5
+ , bytestring == 0.10.*
+ , text == 1.2.*
+ hs-source-dirs: src
+ ghc-options: -Wall -Wincomplete-uni-patterns -Wincomplete-record-updates -Wcompat -Widentities -Wredundant-constraints -fhide-source-paths -Wmissing-export-lists -Wpartial-fields
+ default-language: Haskell2010
+ cc-options: -Wall -fPIC
+ c-sources: cbits/poppler.cc
+ , cbits/stdstring.cc
+ extra-libraries: stdc++
+ pkgconfig-depends: poppler-cpp
+
+test-suite pdftotext-test
+ default-language: Haskell2010
+ build-depends: pdftotext
+ , base >= 4.11 && < 5
+ , text == 1.2.*
+ , hspec == 2.7.*
+ type: exitcode-stdio-1.0
+ hs-source-dirs: test
+ main-is: Spec.hs
+ other-modules: PdftotextSpec
+ build-tool-depends: hspec-discover:hspec-discover == 2.*
No newline at end of file
A => src/Pdftotext.hs +76 -0
@@ 0,0 1,76 @@
+{-# LANGUAGE BlockArguments #-}
+
+{- ORMOLU_DISABLE -}
+{-|
+Module : Pdftotext
+Description : Extracts text from PDF using poppler
+Copyright : (c) 2020 G. Eyaeb
+License : BSD-3-Clause
+Maintainer : geyaeb@protonmail.com
+Stability : experimental
+Portability : POSIX
+
+=== Usage
+
+> import qualified Data.Text.IO as T
+> import Pdftotext
+>
+> main :: IO ()
+> main = do
+> Just pdf <- openFile "path/to/file.pdf"
+> T.putStrLn $ pdftotext Physical pdf
+
+-}
+{- ORMOLU_ENABLE -}
+module Pdftotext
+ ( -- * Types
+ Document,
+ Layout (..),
+ Page,
+
+ -- * Loading PDF's
+ openByteString,
+ openFile,
+
+ -- * Document functions
+ page,
+ pages,
+ pagesTotal,
+ pdftotext,
+
+ -- * Page functions
+ pageNumber,
+ pageOutOf,
+ pageText,
+ )
+where
+
+import Data.ByteString
+import Data.Text (Text)
+import GHC.IO (unsafePerformIO)
+import Pdftotext.Internal
+
+-- | Open a PDF held in memory as a 'ByteString'. If the bytes cannot be
+-- loaded as a PDF document, 'Nothing' is returned.
+--
+-- Pure wrapper around 'openByteStringIO' via 'unsafePerformIO'.
+openByteString :: ByteString -> Maybe Document
+openByteString bytes = unsafePerformIO (openByteStringIO bytes)
+
+-- | Return the page with the given number from a PDF document, if such a
+-- page exists.
+--
+-- Pure wrapper around 'pageIO' via 'unsafePerformIO'.
+page :: Int -> Document -> Maybe Page
+page no = unsafePerformIO . pageIO no
+
+-- | Return every page of the document.
+--
+-- Pure wrapper around 'pagesIO' via 'unsafePerformIO'.
+pages :: Document -> [Page]
+pages doc = unsafePerformIO (pagesIO doc)
+
+-- | Return the number of pages the document contains.
+--
+-- Pure wrapper around 'pagesTotalIO' via 'unsafePerformIO'.
+pagesTotal :: Document -> Int
+pagesTotal doc = unsafePerformIO (pagesTotalIO doc)
+
+-- | Extract the text of a whole PDF document using the given 'Layout'.
+--
+-- Pure wrapper around 'pdftotextIO' via 'unsafePerformIO'.
+pdftotext :: Layout -> Document -> Text
+pdftotext lay = unsafePerformIO . pdftotextIO lay
+
+-- | Extract the text of a single page using the given 'Layout'.
+--
+-- Pure wrapper around 'pageTextIO' via 'unsafePerformIO'.
+pageText :: Layout -> Page -> Text
+pageText l = unsafePerformIO . pageTextIO l
A => src/Pdftotext/Foreign.hs +92 -0
@@ 0,0 1,92 @@
+{-# LANGUAGE BlockArguments #-}
+
+{- ORMOLU_DISABLE -}
+{-|
+Module : Pdftotext.Foreign
+Description : Foreign interface
+Copyright : (c) 2020 G. Eyaeb
+License : BSD-3-Clause
+Maintainer : geyaeb@protonmail.com
+Stability : experimental
+Portability : POSIX
+-}
+{- ORMOLU_ENABLE -}
+module Pdftotext.Foreign
+ ( -- * C++ objects
+ Poppler_Document,
+ Poppler_Page,
+
+ -- * 'std::string' helper
+ StdString,
+ asText,
+ stringToText,
+
+ -- * FFI
+ ffiOpenPdf,
+ ffiOpenData,
+ ffiDocumentDelete,
+ ffiDocumentPages,
+ ffiDocumentOpenPage,
+ ffiPageDelete,
+ ffiPageText,
+ ffiStringLength,
+ ffiStringDelete,
+ ffiStringCopy,
+ )
+where
+
+import qualified Data.Text as T
+import qualified Data.Text.Foreign as T
+import Foreign
+import Foreign.C
+
+-- | Uninhabited tag type standing for the C++ @poppler::document@ class;
+-- it is only used as the target of a 'Ptr'.
+data Poppler_Document
+
+-- | Uninhabited tag type standing for the C++ @poppler::page@ class;
+-- it is only used as the target of a 'Ptr'.
+data Poppler_Page
+
+-- | Uninhabited tag type standing for the C++ @std::string@ class.
+data CStdString
+
+-- | Pointer to a C++ @std::string@ as returned by 'ffiPageText'.
+type StdString = Ptr CStdString
+
+-- | @poppler_document_open_pdf@: load a document from a file path given as
+-- a C string. Callers in this package treat a null result as failure.
+foreign import ccall unsafe "poppler_document_open_pdf"
+  ffiOpenPdf :: CString -> IO (Ptr Poppler_Document)
+
+-- | @poppler_document_open_data@: load a document from a memory buffer of
+-- the given length in bytes. The length is a 'CSize' because the C
+-- prototype declares it as @size_t@.
+foreign import ccall unsafe "poppler_document_open_data"
+  ffiOpenData :: Ptr Word8 -> CSize -> IO (Ptr Poppler_Document)
+
+-- | Address of @poppler_document_delete@, suitable as a 'ForeignPtr'
+-- finalizer.
+foreign import ccall unsafe "&poppler_document_delete"
+  ffiDocumentDelete :: FunPtr (Ptr Poppler_Document -> IO ())
+
+-- | @poppler_document_pages@: number of pages in the document.
+foreign import ccall unsafe "poppler_document_pages"
+  ffiDocumentPages :: Ptr Poppler_Document -> IO CInt
+
+-- | @poppler_document_open_page@: create the page object for a zero-based
+-- page index.
+foreign import ccall unsafe "poppler_document_open_page"
+  ffiDocumentOpenPage :: Ptr Poppler_Document -> CInt -> IO (Ptr Poppler_Page)
+
+-- | Address of @poppler_page_delete@, suitable as a 'ForeignPtr' finalizer.
+foreign import ccall unsafe "&poppler_page_delete"
+  ffiPageDelete :: FunPtr (Ptr Poppler_Page -> IO ())
+
+-- | @poppler_page_text@: extract the text of a page into a freshly
+-- allocated @std::string@. The second argument is the layout code
+-- (0 = raw, 1 = physical, anything else = default); it is a 'CInt' because
+-- the C prototype declares it as @int@.
+foreign import ccall unsafe "poppler_page_text"
+  ffiPageText :: Ptr Poppler_Page -> CInt -> IO StdString
+
+-- | @string_get_length@: length in bytes of a @std::string@. The result is
+-- a 'CSize' because the C function returns @size_t@.
+foreign import ccall unsafe "string_get_length"
+  ffiStringLength :: StdString -> IO CSize
+
+-- | @string_delete@: destroy a heap-allocated @std::string@.
+foreign import ccall unsafe "string_delete"
+  ffiStringDelete :: StdString -> IO ()
+
+-- | @string_copy@: copy the bytes of a @std::string@ into the given buffer,
+-- which must be at least 'ffiStringLength' bytes long.
+foreign import ccall unsafe "string_copy"
+  ffiStringCopy :: StdString -> Ptr CChar -> IO ()
+
+-- | Run an action that yields a @std::string@ and convert its result to
+-- 'T.Text' with 'stringToText'.
+asText :: IO StdString -> IO T.Text
+asText produce = produce >>= stringToText
+
+-- | Convert a C++ @std::string@ into 'T.Text'.
+--
+-- The bytes are copied into a temporary buffer and decoded from there.
+-- After decoding, the @std::string@ is deleted, so the pointer must not be
+-- used again once this function has returned.
+stringToText :: StdString -> IO T.Text
+stringToText str = do
+  size <- fromIntegral <$> ffiStringLength str
+  allocaBytes size \buf -> do
+    ffiStringCopy str buf
+    decoded <- T.peekCStringLen (buf, size)
+    ffiStringDelete str
+    return decoded
A => src/Pdftotext/Internal.hs +127 -0
@@ 0,0 1,127 @@
+{-# LANGUAGE BlockArguments #-}
+
+{- ORMOLU_DISABLE -}
+{-|
+Module : Pdftotext.Internal
+Description : Internal functions
+Copyright : (c) 2020 G. Eyaeb
+License : BSD-3-Clause
+Maintainer : geyaeb@protonmail.com
+Stability : experimental
+Portability : POSIX
+
+Internal functions.
+-}
+{- ORMOLU_ENABLE -}
+module Pdftotext.Internal
+ ( -- * Types
+ Document (..),
+ Layout (..),
+ Page (..),
+
+ -- * Loading PDF's
+ openByteStringIO,
+ openFile,
+
+ -- * Document functions
+ pageIO,
+ pagesIO,
+ pagesTotalIO,
+ pdftotextIO,
+
+ -- * Page functions
+ pageTextIO,
+ )
+where
+
+import Control.Monad (forM)
+import Data.ByteString.Internal
+import qualified Data.Text as T
+import Foreign (ForeignPtr, newForeignPtr, nullPtr, plusPtr, withForeignPtr)
+import Foreign.C (withCString)
+import Pdftotext.Foreign
+
+-- | Handle to a loaded PDF document. Wraps a 'ForeignPtr' to the underlying
+-- poppler document; 'openFile' and 'openByteStringIO' attach
+-- 'ffiDocumentDelete' as its finalizer.
+newtype Document = Document (ForeignPtr Poppler_Document)
+
+-- | A single page of a 'Document', together with its position in it.
+--
+-- NOTE(review): a 'Page' does not keep a reference to the 'Document' it
+-- came from -- confirm that poppler pages stay valid if the document's
+-- finalizer runs first.
+data Page = Page
+ { -- | Number of this page in original document.
+ pageNumber :: Int,
+ -- | Total number of pages in original document.
+ pageOutOf :: Int,
+ -- | Pointer to the underlying poppler page; 'pagesIO' and 'pageIO'
+ -- attach 'ffiPageDelete' as its finalizer.
+ pagePtr :: ForeignPtr Poppler_Page
+ }
+
+-- | Renders as @Page n/total@; the page pointer is not shown.
+instance Show Page where
+  show pg = concat ["Page ", show (pageNumber pg), "/", show (pageOutOf pg)]
+
+-- | Layout of the text extracted from a PDF. 'pageTextIO' translates each
+-- constructor into the integer layout code passed to 'ffiPageText'.
+data Layout
+ = -- | Text emulates layout of PDF, including horizontal spaces,
+ -- and preserves hyphenation; corresponds to calling @pdftotext -layout@
+ Physical
+ | -- | Discards horizontal spaces, preserves hyphenation;
+ -- corresponds to calling @pdftotext -raw@
+ Raw
+ | -- | Discards horizontal spaces, removes hyphenation;
+ -- corresponds to calling @pdftotext@ without layout argument
+ None
+ deriving (Eq, Show)
+
+-- | Open a PDF from a file. If the file does not exist or cannot be parsed
+-- as a valid PDF (that is, poppler hands back a null pointer), 'Nothing' is
+-- returned. On success the document is freed by a finalizer.
+openFile :: FilePath -> IO (Maybe Document)
+openFile path = do
+  raw <- withCString path ffiOpenPdf
+  if raw == nullPtr
+    then pure Nothing
+    else Just . Document <$> newForeignPtr ffiDocumentDelete raw
+
+-- | Open a PDF represented as a bytestring. If the document cannot be
+-- parsed as a valid PDF (poppler hands back a null pointer), 'Nothing' is
+-- returned. On success the document is freed by a finalizer.
+openByteStringIO :: ByteString -> IO (Maybe Document)
+openByteStringIO (PS fptr off len) =
+  withForeignPtr fptr \base -> do
+    -- A 'ByteString' may be a slice of a larger buffer: its payload starts
+    -- 'off' bytes past the base pointer, so the offset must be applied
+    -- before handing the address to poppler.
+    docptr <- ffiOpenData (base `plusPtr` off) (fromIntegral len)
+    if docptr == nullPtr
+      then return Nothing
+      else Just . Document <$> newForeignPtr ffiDocumentDelete docptr
+
+-- | Return every page of the document, in order. Page numbers stored in
+-- the results start at 1, while poppler is asked for zero-based indices.
+pagesIO :: Document -> IO [Page]
+pagesIO (Document doc) =
+  withForeignPtr doc \docptr -> do
+    total <- ffiDocumentPages docptr
+    let mkPage ix = do
+          fp <- newForeignPtr ffiPageDelete =<< ffiDocumentOpenPage docptr ix
+          return (Page (fromIntegral ix + 1) (fromIntegral total) fp)
+    mapM mkPage [0 .. total - 1]
+
+-- | Return the page with the given 1-based number from a PDF document.
+-- Numbers below 1 or above the page count yield 'Nothing'.
+pageIO :: Int -> Document -> IO (Maybe Page)
+pageIO no d@(Document docptr) = withForeignPtr docptr \ptr -> do
+  total <- pagesTotalIO d
+  if no < 1 || no > total
+    then return Nothing
+    else do
+      -- poppler indexes pages from zero
+      fp <- ffiDocumentOpenPage ptr (fromIntegral no - 1) >>= newForeignPtr ffiPageDelete
+      return (Just (Page no total fp))
+
+-- | Return the number of pages the document contains.
+pagesTotalIO :: Document -> IO Int
+pagesTotalIO (Document doc) = do
+  count <- withForeignPtr doc ffiDocumentPages
+  return (fromIntegral count)
+
+-- | Extract the text of a single page using the given 'Layout'.
+pageTextIO :: Layout -> Page -> IO T.Text
+pageTextIO layout (Page _ _ ptr) =
+  withForeignPtr ptr \p -> ffiPageText p (layoutCode layout) >>= stringToText
+  where
+    -- Integer codes understood by the C shim behind 'ffiPageText'.
+    layoutCode Raw = 0
+    layoutCode Physical = 1
+    layoutCode None = 2
+
+-- | Extract the text of a whole PDF document using the given 'Layout':
+-- the texts of all pages are concatenated in page order.
+pdftotextIO :: Layout -> Document -> IO T.Text
+pdftotextIO layout doc =
+  T.concat <$> (pagesIO doc >>= mapM (pageTextIO layout))
A => stack.yaml +3 -0
@@ 0,0 1,3 @@
+resolver: lts-15.11
+packages:
+ - .
A => test/PdftotextSpec.hs +32 -0
@@ 0,0 1,32 @@
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module PdftotextSpec (spec) where
+
+import qualified Data.Text.IO as T
+import Pdftotext
+import Test.Hspec
+
+-- | Compares the library's output for @test/simple.pdf@ with the expected
+-- text files stored next to it, and checks the page count.
+spec :: Spec
+spec =
+  before (openFile "test/simple.pdf") do
+    describe "pdftotext with layout `None`" do
+      it "produces same output as `pdftotext simple.pdf`" \(Just doc) ->
+        T.readFile "test/simple_none.txt" >>= (pdftotext None doc `shouldBe`)
+
+    describe "pdftotext with layout `Raw`" do
+      it "produces same output as `pdftotext -raw simple.pdf`" \(Just doc) ->
+        T.readFile "test/simple_raw.txt" >>= (pdftotext Raw doc `shouldBe`)
+
+    describe "pdftotext with layout `Physical`" do
+      it "produces same output as `pdftotext -layout simple.pdf`" \(Just doc) ->
+        T.readFile "test/simple_physical.txt" >>= (pdftotext Physical doc `shouldBe`)
+
+    describe "PDF" do
+      it "should have expected number of pages (`pagesTotal`)" \(Just doc) ->
+        pagesTotal doc `shouldBe` 4
+      it "should contain correct number of pages (`pages`)" \(Just doc) ->
+        length (pages doc) `shouldBe` 4
A => test/Spec.hs +1 -0
@@ 0,0 1,1 @@
+{-# OPTIONS_GHC -F -pgmF hspec-discover #-}
A => test/simple.pdf +0 -0
A => test/simple.tex +22 -0
@@ 0,0 1,22 @@
+\documentclass{article}
+
+\title{Simple document for testing}
+\date{}
+\pagenumbering{gobble}
+
+\begin{document}
+
+\maketitle
+\newpage
+
+They who can give up
+\newpage
+
+\parbox{1cm}{
+essential liberty to obtain a little temporary safety
+}
+\newpage
+
+\hspace{2cm} deserve neither\\
+liberty nor safety.
+\end{document}
No newline at end of file
A => test/simple_none.txt +16 -0
@@ 0,0 1,16 @@
+Simple document for testing
+
+They who can give up
+
+essential
+liberty
+to
+obtain
+a little
+temporary
+safety
+
+deserve neither
+liberty nor safety.
+
+
No newline at end of file
A => test/simple_physical.txt +15 -0
@@ 0,0 1,15 @@
+Simple document for testing
+They who can give up
+essential
+lib-
+erty
+to
+obtain
+a little
+tem-
+po-
+rary
+safety
+ deserve neither
+liberty nor safety.
+
No newline at end of file
A => test/simple_raw.txt +15 -0
@@ 0,0 1,15 @@
+Simple document for testing
+They who can give up
+essential
+lib-
+erty
+to
+obtain
+a little
+tem-
+po-
+rary
+safety
+deserve neither
+liberty nor safety.
+
No newline at end of file