@@ 3,70 3,65 @@
(defclass stemword ()
((str :accessor str :initarg :str)
- (R1start :accessor R1start :initarg :R1start)
- (R2start :accessor R2start :initarg :R2start))
- (:default-initargs :R1start 0 :R2start 0))
+ (R1start :accessor R1start :initarg :R1start :initform 0)
+ (R2start :accessor R2start :initarg :R2start :initform 0)))
-(defgeneric replace-suffix (word suffix replacement)
- (:documentation "Replace a suffix and adjust R1start and R2start as needed."))
-
-(defgeneric strip-suffix (word n)
- (:documentation "Remove suffix `n` from the right-hand side of a word."))
+(defgeneric has-suffix-p (word suffix)
+ (:documentation "Predicate for whether `word` ends in `suffix`")
+ (:method ((word stemword) suffix)
+ (and (>= (length (str word)) (length suffix))
+ (search suffix (str word) :test #'string=
+ :start2 (- (length (str word)) (length suffix))))))
(defgeneric reset-R1R2 (word)
- (:documentation "Resets R1start and R2start to ensure they are within bounds of the current string."))
+ (:documentation "Resets R1start and R2start to ensure they are within bounds of the current string.")
+ (:method ((word stemword))
+ (let ((word-length (length (str word))))
+ (when (> (R1start word) word-length)
+ (setf (R1start word) word-length))
+ (when (> (R2start word) word-length)
+ (setf (R2start word) word-length)))))
+
+(defgeneric replace-suffix (word suffix replacement)
+ (:documentation "Replace a suffix and adjust R1start and R2start as needed.")
+ (:method ((word stemword) suffix replacement-str)
+ (when (has-suffix-p word suffix)
+ (let* ((length-no-suffix (- (length (str word)) (length suffix)))
+ (chopped (subseq (str word) 0 length-no-suffix)))
+ (setf (str word) (strcat chopped replacement-str))
+ (reset-R1R2 word)))))
+
+(defgeneric strip-suffix (word n)
+ (:documentation "Remove suffix `n` from the right-hand side of a word.")
+ (:method ((word stemword) (s string))
+ (setf (str word) (subseq (str word) 0 (- (length (str word)) (length s))))
+ (reset-R1R2 word)))
(defgeneric first-prefix (word prefixes)
- (:documentation "Return the first prefix found or nil"))
+ (:documentation "Return the first prefix found or nil")
+ (:method ((word stemword) prefixes)
+ (loop for prefix in prefixes
+ if (search prefix (str word) :test #'string= :end2 (min (length (str word))
+ (length prefix)))
+ return prefix)))
(defgeneric first-suffix (word suffixes)
- (:documentation "Return the first suffix found or nil"))
-
-(defgeneric has-suffix? (word suffix)
- (:documentation "Predicate for whether `word` ends in `suffix`"))
+ (:documentation "Return the first suffix found or nil")
+ (:method ((word stemword) suffixes)
+ (loop for suffix in suffixes
+ if (has-suffix-p word suffix)
+ return suffix)))
(defgeneric strip-n-chars (word n)
- (:documentation "Remove `n` characters from word"))
-
-(defmethod strip-n-chars ((word stemword) (n number))
- (setf (str word) (subseq (str word) 0 (- (length (str word)) n)))
- (reset-R1R2 word))
-
-(defmethod strip-suffix ((word stemword) (s string))
- (setf (str word) (subseq (str word) 0 (- (length (str word)) (length s))))
- (reset-R1R2 word))
-
-(defmethod reset-R1R2 ((word stemword))
- (let ((word-length (length (str word))))
- (if (> (R1start word) word-length)
- (setf (R1start word) word-length))
- (if (> (R2start word) word-length)
- (setf (R2start word) word-length))))
+ (:documentation "Remove `n` characters from word")
+ (:method ((word stemword) (n number))
+ (setf (str word) (subseq (str word) 0 (- (length (str word)) n)))
+ (reset-R1R2 word)))
-(defmethod replace-suffix ((word stemword) suffix replacement-str)
- (if (has-suffix? word suffix)
- (let* ((length-no-suffix (- (length (str word)) (length suffix)))
- (chopped (subseq (str word) 0 length-no-suffix)))
- (setf (str word) (concatenate 'string chopped replacement-str))
- (reset-R1R2 word))))
+(defun strcat (s1 s2)
+ (concatenate 'string s1 s2))
-(defmethod first-prefix ((word stemword) prefixes)
- (loop for prefix in prefixes
- if (search prefix (str word) :test #'string= :end2 (min (length (str word))
- (length prefix)))
- return prefix))
-
-(defmethod has-suffix? ((word stemword) suffix)
- (and (>= (length (str word)) (length suffix))
- (search suffix (str word) :test #'string=
- :start2 (- (length (str word)) (length suffix)))))
-
-(defmethod first-suffix ((word stemword) suffixes)
- (loop for suffix in suffixes
- if (has-suffix? word suffix)
- return suffix))
-
-(defun lower-vowel? (char)
+(defun lower-vowel-p (char)
(member char (list #\a #\e #\i #\o #\u #\y)))
;; Finds the region after the first non-vowel following a vowel, or a
@@ 76,39 71,40 @@
(defun vnv-suffix (word offset)
(let ((word (str word)))
(loop for char across (subseq word offset)
- and j from (1+ offset)
- if (and (lower-vowel? char)
- (< j (length word))
- (not (lower-vowel? (char word j))))
- return (1+ j)
- finally (return (length word)))))
+ for j from (1+ offset)
+ if (and (lower-vowel-p char)
+ (< j (length word))
+ (not (lower-vowel-p (char word j))))
+ return (1+ j)
+ finally (return (length word)))))
;;; common (english)
(defun normalize-apostrophes (word)
(loop for c across (str word) and i from 0
- if (member c (list #\RIGHT_SINGLE_QUOTATION_MARK
- #\LEFT_SINGLE_QUOTATION_MARK
- #\SINGLE_HIGH-REVERSED-9_QUOTATION_MARK))
- do (setf (char (str word) i) #\')))
+ if (member c (list #\RIGHT_SINGLE_QUOTATION_MARK
+ #\LEFT_SINGLE_QUOTATION_MARK
+ #\SINGLE_HIGH-REVERSED-9_QUOTATION_MARK))
+ do (setf (char (str word) i) #\')))
(defun trim-left-apostrophes (word)
- (if (char= (char (str word) 0) #\')
- (setf (str word) (subseq (str word) 1)
- (R1start word) (1- (R1start word))
- (R2start word) (1- (R2start word)))))
+ (when (char= (char (str word) 0) #\')
+ (setf (str word) (subseq (str word) 1)
+ (R1start word) (1- (R1start word))
+ (R2start word) (1- (R2start word)))))
(defun capitalize-Ys (word)
- (loop for c across (str word) and i from 0
- if (and (char= c #\y)
- (or (= i 0)
- (lower-vowel? (char (str word) (1- i)))))
- do (setf (char (str word) i) #\Y)))
+ (loop for c across (str word)
+ for i from 0
+ if (and (char= c #\y)
+ (or (= i 0)
+ (lower-vowel-p (char (str word) (1- i)))))
+ do (setf (char (str word) i) #\Y)))
(defun uncapitalize-Ys (word)
(loop for c across (str word) and i from 0
- if (char= c #\Y)
- do (setf (char (str word) i) #\y)))
+ if (char= c #\Y)
+ do (setf (char (str word) i) #\y)))
;; R1 is the region after the first non-vowel following a vowel,
;; or is the null region at the end of the word if there is no
@@ 131,11 127,11 @@
(defmacro string-switch (arg (&rest key-values))
`(cond
,@(loop for (k v) in key-values
- collect
- `(,(etypecase k
- (CONS `(loop for s in ',k thereis (string= ,arg s)))
- (STRING `(string= ,arg ,k))
- (BOOLEAN `,k))
+ collect
+ `(,(etypecase k
+ (CONS `(loop for s in ',k thereis (string= ,arg s)))
+ (STRING `(string= ,arg ,k))
+ (BOOLEAN `,k))
,v))))
(defun stem-special-string (str)
@@ 174,8 170,8 @@
;; (b) a vowel at the beginning of the word followed by a non-vowel.
(defun ends-short-syllable (word i)
(cond ((= i 2)
- (and (lower-vowel? (char (str word) 0))
- (not (lower-vowel? (char (str word) 1)))))
+ (and (lower-vowel-p (char (str word) 0))
+ (not (lower-vowel-p (char (str word) 1)))))
((>= i 3)
(let ((s1 (char (str word) (- i 1)))
@@ 183,10 179,11 @@
(s3 (char (str word) (- i 3))))
;; Check for a vowel followed by a non-vowel other than w, x or Y
;; and preceded by a non-vowel.
- (and (not (lower-vowel? s1))
+ (and (not (lower-vowel-p s1))
(not (member s1 (list #\w #\x #\Y)))
- (lower-vowel? s2)
- (not (lower-vowel? s3)))))))
+ (lower-vowel-p s2)
+ (not (lower-vowel-p s3)))))
+ (t nil)))
(defun is-short-word (word)
(unless (< (R1start word) (length (str word)))
@@ 195,8 192,8 @@
;; Step 0 is to strip off apostrophes and "s".
(defun step-0 (word)
(let ((suffix (first-suffix word (list "'s'" "'s" "'"))))
- (if suffix
- (strip-suffix word suffix))))
+ (when suffix
+ (strip-suffix word suffix))))
;; Step 1a is normalization of various special "s"-endings.
(defun step-1a (word)
@@ 215,51 212,52 @@
;; immediately before the s (so gas and this retain the s,
;; gaps and kiwis lose it)
("s" (loop for char across (subseq (str word) 0 (max 0 (- word-len 2)))
- if (lower-vowel? char)
- return (strip-suffix word suffix)))))))
+ if (lower-vowel-p char)
+ return (strip-suffix word suffix)))))))
;; Step 1b is the normalization of various "ly" and "ed" sufficies.
(defun step-1b (word)
(let ((suffix (first-suffix word (list "eedly" "ingly" "edly" "ing" "eed" "ed"))))
(string-switch suffix
((("eed" "eedly")
- (if (<= (length suffix) (- (length (str word)) (R1start word)))
- (replace-suffix word suffix "ee")))
+ (when (<= (length suffix) (- (length (str word)) (R1start word)))
+ (replace-suffix word suffix "ee")))
(("ed" "edly" "ing" "ingly")
(let* ((root-len (- (length (str word)) (length suffix)))
(has-lower-vowel (loop for char across (subseq (str word) 0 root-len)
- thereis (lower-vowel? char))))
- (if has-lower-vowel
- (let ((original-R1start (R1start word))
- (original-R2start (R2start word)))
- (strip-suffix word suffix)
- (let ((new-suffix (first-suffix word (list "at" "bl" "iz" "bb"
- "dd" "ff" "gg" "mm"
- "nn" "pp" "rr" "tt"))))
- (if new-suffix
- (string-switch new-suffix
- ((("at" "bl" "iz")
- ;; If the word ends "at", "bl" or "iz" add "e"
- (replace-suffix word new-suffix (concatenate 'string new-suffix "e")))
+ thereis (lower-vowel-p char))))
+ (when has-lower-vowel
+ (let ((original-R1start (R1start word))
+ (original-R2start (R2start word)))
+ (strip-suffix word suffix)
+ (let ((new-suffix (first-suffix word (list "at" "bl" "iz" "bb"
+ "dd" "ff" "gg" "mm"
+ "nn" "pp" "rr" "tt"))))
+ (cond (new-suffix
+ (string-switch new-suffix
+ ((("at" "bl" "iz")
+ ;; If the word ends "at", "bl" or "iz" add "e"
+ (replace-suffix word new-suffix (strcat new-suffix "e")))
- (("bb" "dd" "ff" "gg" "mm" "nn" "pp" "rr" "tt")
- ;; If the word ends with a double remove the last letter.
- (strip-n-chars word 1))))
+ (("bb" "dd" "ff" "gg" "mm" "nn" "pp" "rr" "tt")
+ ;; If the word ends with a double remove the last letter.
+ (strip-n-chars word 1)))))
+
;; otherwise, if the word is short, add "e"
- (if (is-short-word word)
- (setf (str word) (concatenate 'string (str word) "e")
- (R1start word) (length (str word))
- (R2start word) (length (str word)))))
+ ((is-short-word word)
+ (setf (str word) (strcat (str word) "e")
+ (R1start word) (length (str word))
+ (R2start word) (length (str word)))))
- ;; Because we did a double replacement, we need
- ;; to fix R1 and R2 manually.
- (if (< original-R1start (length (str word)))
- (setf (R1start word) original-R1start)
- (setf (R1start word) (length (str word))))
- (if (< original-R2start (length (str word)))
- (setf (R2start word) original-R2start)
- (setf (R2start word) (length (str word)))))))))))))
+ ;; Because we did a double replacement, we need
+ ;; to fix R1 and R2 manually.
+ (if (< original-R1start (length (str word)))
+ (setf (R1start word) original-R1start)
+ (setf (R1start word) (length (str word))))
+ (if (< original-R2start (length (str word)))
+ (setf (R2start word) original-R2start)
+ (setf (R2start word) (length (str word)))))))))))))
;; Step 1c is the normalization of various "y" endings.
(defun step-1c (word)
@@ 267,11 265,11 @@
;; Replace suffix y or Y by i if preceded by a non-vowel which is
;; not the first letter of the word:
;; cry -> cri, by -> by, say -> say
- (if (and (> rs-length 2)
- (or (char= #\y (char (str word) (- rs-length 1)))
- (char= #\Y (char (str word) (- rs-length 1))))
- (not (lower-vowel? (char (str word) (- rs-length 2)))))
- (setf (char (str word) (- rs-length 1)) #\i))))
+ (when (and (> rs-length 2)
+ (or (char= #\y (char (str word) (1- rs-length)))
+ (char= #\Y (char (str word) (1- rs-length))))
+ (not (lower-vowel-p (char (str word) (- rs-length 2)))))
+ (setf (char (str word) (1- rs-length)) #\i))))
;; Step 2 is the stemming of various endings found in R1 including
;; "al", "ness", and "li"
@@ 283,32 281,32 @@
"izer" "bli" "ogi" "li")))
(word-len (length (str word)))
(cdeghkmnrt (loop for c across "cdeghkmnrt" collect c)))
- (if (and suffix
- (<= (length suffix) (- word-len (R1start word))))
- (string-switch suffix
- (("li" (if (and (>= word-len 3)
+ (when (and suffix
+ (<= (length suffix) (- word-len (R1start word))))
+ (string-switch suffix
+ (("li" (when (and (>= word-len 3)
(member (char (str word) (- word-len 3)) cdeghkmnrt))
- (strip-suffix word suffix)))
+ (strip-suffix word suffix)))
- ("ogi" (if (and (>= word-len 4)
+ ("ogi" (when (and (>= word-len 4)
(char= (char (str word) (- word-len 4)) #\l))
- (replace-suffix word suffix "og")))
+ (replace-suffix word suffix "og")))
- (t (replace-suffix word suffix (string-switch suffix
- (("tional" "tion")
- ("enci" "ence")
- ("anci" "ance")
- ("abli" "able")
- ("entli" "ent")
- ("fulness" "ful")
- ("fulli" "ful")
- ("lessli" "less")
- (("izer" "ization") "ize")
- (("ousli" "ousness") "ous")
- (("biliti" "bli") "ble")
- (("iveness" "iviti") "ive")
- (("alism" "aliti" "alli") "al")
- (("ational" "ation" "ator") "ate"))))))))))
+ (t (replace-suffix word suffix (string-switch suffix
+ (("tional" "tion")
+ ("enci" "ence")
+ ("anci" "ance")
+ ("abli" "able")
+ ("entli" "ent")
+ ("fulness" "ful")
+ ("fulli" "ful")
+ ("lessli" "less")
+ (("izer" "ization") "ize")
+ (("ousli" "ousness") "ous")
+ (("biliti" "bli") "ble")
+ (("iveness" "iviti") "ive")
+ (("alism" "aliti" "alli") "al")
+ (("ational" "ation" "ator") "ate"))))))))))
;; Step 3 is the stemming of various longer sufficies
(defun step-3 (word)
@@ 318,16 316,16 @@
(word-len (length (str word))))
(unless (or (> (length suffix) (- word-len (R1start word)))
(not suffix))
- (string-switch suffix
- (("ative" (if (>= (- word-len (R2start word)) 5)
- (strip-suffix word suffix)))
+ (string-switch suffix
+ (("ative" (when (>= (- word-len (R2start word)) 5)
+ (strip-suffix word suffix)))
- (t (replace-suffix word suffix (string-switch suffix
- (("alize" "al")
- ("tional" "tion")
- ("ational" "ate")
- (("ful" "ness") "")
- (("icate" "iciti" "ical") "ic"))))))))))
+ (t (replace-suffix word suffix (string-switch suffix
+ (("alize" "al")
+ ("tional" "tion")
+ ("ational" "ate")
+ (("ful" "ness") "")
+ (("icate" "iciti" "ical") "ic"))))))))))
;; Step 4 search for the longest of the following and if found in R2, act
(defun step-4 (word)
@@ 337,12 335,12 @@
"al" "er" "ic")))
(word-len (length (str word))))
(unless (or (> (length suffix) (- word-len (R2start word)))
- (not suffix))
- (string-switch suffix
- (("ion" (if (and (>= word-len 4) (or (char= (char (str word) (- word-len 4)) #\s)
+ (not suffix))
+ (string-switch suffix
+ (("ion" (when (and (>= word-len 4) (or (char= (char (str word) (- word-len 4)) #\s)
(char= (char (str word) (- word-len 4)) #\t)))
- (strip-suffix word suffix)))
- (t (strip-suffix word suffix)))))))
+ (strip-suffix word suffix)))
+ (t (strip-suffix word suffix)))))))
;; Step 5 is the stemming of "e" and "l" sufficies found in R2
(defun step-5 (word)
@@ 375,17 373,15 @@
(downcased (string-downcase trimmed))
(w (make-instance 'stemword :str downcased))
(special-string (stem-special-string str)))
- (if special-string
- special-string
- (progn
- (preprocess w)
- (step-0 w)
- (step-1a w)
- (step-1b w)
- (step-1c w)
- (step-2 w)
- (step-3 w)
- (step-4 w)
- (step-5 w)
- (postprocess w)
- (str w)))))
+ (cond (special-string special-string)
+ (t (preprocess w)
+ (step-0 w)
+ (step-1a w)
+ (step-1b w)
+ (step-1c w)
+ (step-2 w)
+ (step-3 w)
+ (step-4 w)
+ (step-5 w)
+ (postprocess w)
+ (str w)))))