@@ 1,6 1,6 @@
# Guile-dsv
-The delimiter-separated values format is a superset of CSV. This implements a DSV parser for guile with a streaming interface and a more convenient port-exhausting or string-reading interface.
+The delimiter-separated values format is a superset of CSV (although headers are not currently supported). This implements a DSV parser for guile with a streaming interface and a more convenient port-exhausting or string-reading interface.
# Documentation
@@ 9,8 9,9 @@ The delimiter-separated values format is
;; These are all the available options for the procedures in this library.
;; All options below are the standard ones, and do not have to be provided.
- ;; #:newline can also be a string of maximum 2 characters.
- (define reader (make-dsv-reader file #:delimiter #\, #:newline #\newline #:escape #\"))
+ ;; #:newline can be 'cr, 'lf, 'crlf or 'lax. 'lax accepts any of the three
+ ;; line endings (CR, LF or CRLF).
+ (define reader (make-dsv-reader file #:delimiter #\, #:newline 'lf #:escape #\"))
;; reader is now a thunk that returns a vector of dsv cells:
(reader) ;; => #("my" "delimited" "data")
@@ 23,7 24,7 @@ The delimiter-separated values format is
(dsv-file->list "csv.csv") ;; => #("my" "delimited" "data")
- (call-with-input-file "csv.csv" (dsv->list)) ;; => #("my" "delimited" "data")
+ (call-with-input-file "csv.csv" dsv->list) ;; => #("my" "delimited" "data")
;; Both the above procedures (dsv-file->list and dsv->list) take an
;; optional keyword spec as shown for make-dsv-reader
@@ 31,7 32,7 @@ The delimiter-separated values format is
# Speed
-It is slightly faster than guile-csv for CSV files, with the bonus that it actually parses proper CSV files with CRLF line endings. This means a 35mb CSV file is parsed in about 3.3s using guile 2.9.4.
+It is slightly faster than guile-csv for CSV files, with the bonus that it actually parses proper CSV files with CRLF line endings. This means a 35 MB CSV file is parsed in about 4s using guile 2.9.4. Python is twice as fast, due to its csv reader being written in optimized and nicely buffered C.
# License
@@ 41,6 42,13 @@ LGPLv3. See the file header.
I was trying my best to use data-type specific comparisons, but apparently eqv? was faster (probably due to fewer type checks in the generated code). That yielded quite a speed increase. I will have to try to find other such nice little speedups.
+Re-add trimming.
+
+Enforce length of rows.
+
+Change the interface to allow composing with call-with-input-xxxx and the likes.
+
I tried using a bigger string buffer and using the same buffer for each instantiated reader, but that made it run slower than using a new buffer for each line.
Anyway, I would like to write some tests to make sure it outputs correct code. Then I would like to make it fast. After that, I would like to make it pretty.
+
@@ 23,7 23,7 @@
;; A streaming version of the rather nifty guile-csv written by Andy Wingo
;; and Nala Ginrut. It is changed to use a string buffer instead of a list
;; (with a new one per row, which happened to be the fastest). It also supports
-;; more features, such as user-configurable line endings (max 2 characters),
+;; more features, such as user-configurable line separators,
;; escape characters and delimiters, although it will _not_ play well with
;; space-separated values when more than one space separates the values.
;; Parsing " " will return two empty strings.
@@ 51,11 51,17 @@
(loop (peek-char port)))))
;; Returns a line-reader.
-(define* (make-dsv-reader port
+(define* (make-dsv-reader port-or-string
#:key
(delimiter #\,)
- (newline #\newline)
+ (newline 'lf)
(escape #\" ))
+ ;; Normalise the first argument to a port: a port is used as-is, a string
+ ;; is wrapped in a string input port. Anything else is rejected, and the
+ ;; offending value is passed to `error` so it shows up in the message.
+ (define port
+   (cond
+     ((port? port-or-string) port-or-string)
+     ((string? port-or-string) (open-input-string port-or-string))
+     (else (error "unsupported argument to make-dsv-reader. Expected port or string"
+                  port-or-string))))
+
(define (getch) (read-char port))
(define (char-quote? ch)
@@ 65,19 71,30 @@
(or (eqv? #\" ch)
(eqv? escape ch)))
- (define (at-newline ch pos)
- (cond
- ((eqv? ch newline))
- ((string? newline)
- (if (string=? newline (string ch (peek-char port)))
- (begin
- (read-char port)
- #t)
- #f))
- (else #f)))
+ ;; at-newline: one-argument predicate telling whether the character C, just
+ ;; read from PORT, terminates the current row. The variant is picked once
+ ;; from the #:newline option. Every variant takes exactly one argument,
+ ;; because the callers invoke it as (at-newline ch). The 'crlf and 'lax
+ ;; variants may consume a pending #\newline from PORT.
+ (define at-newline
+   (case newline
+     ((cr) (lambda (c) (eqv? c #\return)))
+     ((lf) (lambda (c) (eqv? c #\newline)))
+     ((crlf)
+      (lambda (c)
+        (if (eqv? c #\return)
+            (let ((lf? (peek-char port)))
+              (cond ((eof-object? lf?) #t) ;; This should really be an error, but let's be nice!
+                    ;; Swallow the LF so the next row starts after it.
+                    ((eqv? lf? #\newline) (read-char port) #t)
+                    ;; A lone CR is not a row terminator in 'crlf mode.
+                    (else #f)))
+            #f)))
+     ((lax)
+      (lambda (c)
+        (case c
+          ((#\newline) #t)
+          ((#\return)
+           ;; Treat CR LF as a single line ending; a lone CR also counts.
+           (when (eqv? (peek-char port) #\newline)
+             (read-char port))
+           #t)
+          (else #f))))
+     ;; Fail at construction time instead of binding at-newline to an
+     ;; unspecified value that only blows up on the first character read.
+     (else (error "unsupported #:newline option to make-dsv-reader. Expected 'cr, 'lf, 'crlf or 'lax"
+                  newline))))
+
(define (finish-row str pos)
- (list (substring/copy str 0 pos)))
+ (list (substring str 0 pos)))
(define (read-string ch str pos)
(cond
@@ 106,7 123,7 @@
((eqv? delimiter quote?)
(cons (substring/copy str 0 pos) (read-any (getch) str 0)))
;; The row ended. Finish the row
- ((at-newline quote? pos)
+ ((at-newline quote?)
(finish-row str pos))))
(define (read-escaped quote? str pos)
@@ 123,7 140,7 @@
(if (zero? pos)
(read-string (getch) str 0)
(error "Quote in non-allowed place" (substring/copy str 0 pos) (string ch))))
- ((at-newline ch pos)
+ ((at-newline ch)
(finish-row str pos))
(else
(read-any (getch) (set-pos! str pos ch) (+ pos 1)))))