9478628cfa2e — Linus Björnstam 5 years ago
Changed the implementation a bit.
No more arbitrary newlines, but fixed: 'cr 'lf 'crlf or 'lax.
2 files changed, 46 insertions(+), 21 deletions(-)

M README.md
M dsv-impl.scm
M README.md +13 -5
@@ 1,6 1,6 @@ 
 # Guile-dsv
 
-The delimiter-separated values format is a superset of CSV. This implements a DSV parser for guile with a streaming interface and a more convenient port-exhausting or string-reading interface.
+The delimiter-separated values format is a superset of CSV (although headers are not currently supported). This implements a DSV parser for guile with a streaming interface and a more convenient port-exhausting or string-reading interface.
 
 # Documentation
 

          
@@ 9,8 9,9 @@ The delimiter-separated values format is
     
     ;; These are all the available options for the procedures in this library.
     ;; All options below are the standard ones, and do not have to be provided.
-    ;; #:newline can also be a string of maximum 2 characters.
-    (define reader (make-dsv-reader file #:delimiter #\, #:newline #\newline #:escape #\"))
+    ;; #:newline can be 'cr, 'lf, 'crlf or 'lax. 'lax accepts any of the three
+    ;; line endings (CR, LF or CRLF).
+    (define reader (make-dsv-reader file #:delimiter #\, #:newline 'lf #:escape #\"))
     
     ;; reader is now a thunk that returns a vector of dsv cells:
     (reader) ;; => #("my" "delimited" "data")

          
@@ 23,7 24,7 @@ The delimiter-separated values format is
     
     (dsv-file->list "csv.csv") ;; => #("my" "delimited" "data")
     
-    (call-with-input-file "csv.csv" (dsv->list)) ;; => #("my" "delimited" "data")
+    (call-with-input-file "csv.csv" dsv->list) ;; => #("my" "delimited" "data")
     
     ;; Both the above procedures (dsv-file->list and dsv->list) take an 
     ;; optional keyword spec as shown for make-dsv-reader

          
@@ 31,7 32,7 @@ The delimiter-separated values format is
 
 # Speed
 
-It is slightly faster than guile-csv for CSV files, with the bonus that it actually parses proper CSV files with CRLF line endings. This means a 35mb CSV file is parsed in about 3.3s using guile 2.9.4.
+It is slightly faster than guile-csv for CSV files, with the bonus that it actually parses proper CSV files with CRLF line endings. This means a 35mb CSV file is parsed in about 4s using guile 2.9.4. Python is twice as fast, due to its csv reader being written in optimized and nicely buffered C.
 
 # License
 

          
@@ 41,6 42,13 @@ LGPLv3. See the file header.
 
 I was trying my best to use data-type specific comparisons, but apparently eqv? was faster (probably due to fewer type checks in the generated code). That yielded quite a speed increase. I will have to try to find other such nice little speedups.
 
+Re-add trimming.
+
+Enforce length of rows.
+
+Change the interface to allow composing with call-with-input-xxxx and the likes.
+
 I tried using a bigger string buffer and using the same buffer for each instantiated reader, but that made it run slower than using a new buffer for each line.
 
 Anyway, I would like to write some tests to make sure it outputs correct code. Then I would like to make it fast. After that, I would like to make it pretty.
+

          
M dsv-impl.scm +33 -16
@@ 23,7 23,7 @@ 
 ;; A streaming version of the rather nifty guile-csv written by Andy Wingo
 ;; and Nala Ginrut. It is changed to use a string buffer instead of a list
 ;; (with a new one per row, which happened to be the fastest). It also supports
-;; more features, such as user-configurable line endings (max 2 characters),
+;; more features, such as user-configurable line separators,
 ;; escape characters and delimiters, although it will _not_ play well with
 ;; space-separated values when more than one space separates the values.
 ;; Parsing " " will return two empty strings.

          
@@ 51,11 51,17 @@ 
       (loop (peek-char port)))))
 
 ;; Returns a line-reader.
-(define* (make-dsv-reader port
+(define* (make-dsv-reader port-or-string
                           #:key
                           (delimiter #\,)
-                          (newline #\newline)
+                          (newline 'lf)
                           (escape #\" ))
+  (define port
+    (cond
+     ((port? port-or-string) port-or-string)
+     ((string? port-or-string) (open-input-string port-or-string))
+     (else (error "unsupported argemunt to make-dsv-reader. Expected port or string"))))
+
   (define (getch) (read-char port))
 
   (define (char-quote? ch)

          
@@ 65,19 71,30 @@ 
     (or (eqv? #\" ch)
         (eqv? escape ch)))
 
-  (define (at-newline ch pos)
-    (cond
-     ((eqv? ch newline))
-     ((string? newline)
-        (if (string=? newline (string ch (peek-char port)))
-            (begin
-              (read-char port)
-              #t)
-            #f))
-     (else #f)))
+  (define at-newline ;; One-argument predicate (at-newline c): does char C end the current row? Picked once from #:newline.
+    (case newline
+      ((cr) (lambda (c) (eqv? c #\return)))
+      ((lf) (lambda (c) (eqv? c #\newline)))
+      ((crlf) (lambda (c) ;; Takes only C, like the other variants; callers invoke (at-newline ch) with one argument.
+                (if (eqv? c #\return)
+                    (let ((lf? (peek-char port)))
+                      (cond ((eof-object? lf?) #t) ;; This should really be an error, but let's be nice!
+                            ((eqv? lf? #\newline) (read-char port) #t) ;; Consume the LF half of the CRLF pair.
+                            (else #f))) ;; A lone CR does not terminate a row in 'crlf mode.
+                    #f)))
+      ((lax) ;; Accepts LF, CR or CRLF as a line ending.
+       (lambda (c)
+         (case c
+           ((#\newline) #t)
+           ((#\return)
+            (when (eqv? (peek-char port) #\newline) ;; Swallow a following LF so CRLF counts as one line ending.
+              (read-char port))
+            #t)
+           (else #f))))))
+
 
   (define (finish-row str pos)
-    (list (substring/copy str 0 pos)))
+    (list (substring str 0 pos)))
 
   (define (read-string ch str pos)
     (cond

          
@@ 106,7 123,7 @@ 
      ((eqv? delimiter quote?)
       (cons (substring/copy str 0 pos) (read-any (getch) str 0)))
      ;; The row ended. Finish the row
-     ((at-newline quote? pos)
+     ((at-newline quote?)
       (finish-row str pos))))
 
   (define (read-escaped quote? str pos)

          
@@ 123,7 140,7 @@ 
       (if (zero? pos)
           (read-string (getch) str 0)
           (error "Quote in non-allowed place" (substring/copy str 0 pos) (string ch))))
-     ((at-newline ch pos)
+     ((at-newline ch)
       (finish-row str pos))
      (else
       (read-any (getch) (set-pos! str pos ch) (+ pos 1)))))