Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ;;; [2018-06-13] Challenge #363 [Intermediate] Word Hy-phen-a-tion By Com-put-er
- ;;; /r/dailyprogrammer https://redd.it/8qxpqd
- ;;;
- ;;; hyphenate.lisp - hyphenate words using Liang's Algorithm
- ;;;
- ;;; uses TeX patterns but not trie data structure to search for them
- ;;;
- ;;; DOES NOT validate that patterns are valid (i.e. number before starting ".")
- ;;; fails this test from challenge page: ceremony => cer-e-mo-ny
- ;;;
- ;;; more failure: $ cat our-hyphens.txt | tr -d [:alpha:] | sort | uniq -c
- ;;;
- ;;; no. of - | should be | but we get
- ;;; -------- * --------- * ----------
- ;;; 0 | 21830 | 22464
- ;;; 1 | 56852 | 56554
- ;;; 2........+ 50452.....| 50078.....
- ;;; 3 | 26630 | 26650
- ;;; 4 | 11751 | 11757
- ;;; 5........+ 4044.....| 4055.....
- ;;; 6 | 1038 | 1037
- ;;; 7 | 195 | 194
- ;;; 8........+ 30.....| 30.....
- ;;; 9 | 1 | 1
- ;;; ==================================
- (defpackage :Liang
- (:use :common-lisp)
- (:export :load-hyphen-patterns :hyphenate :join))
- (in-package :Liang)
- ;;a day or so were wasted not understanding Common Lisp #'subseq
- ;;the &optional argument is 'END not 'len as in every other scripting language... :/
- (defun my-subseq (seq start len)
- (break)
- (if (>= start (length seq))
- (format t "XXX! seq=~a, start=~a, len=~a~%" seq start len)
- (subseq seq start len)))
- (defparameter *substr-lengths* nil) ;facilitates (dolist (x *substr-lengths*) ...) iteration
- (defparameter *patterns* (make-hash-table :test #'equal)) ;Liang patterns indexed by prefix substrings
- (defun strip-digits (str)
- (remove-if #'digit-char-p str))
- ;https://lispnews.wordpress.com/2018/01/15/reading-a-file-line-by-line-revisited/
- ;http://sodaware.sdf.org/notes/cl-read-file-into-string/
- (defun load-hyphen-patterns (file-path)
- ;;reset globals in case opening another pattern file
- (let ((-min-substr-length- 99)
- (-max-substr-length- -99))
- (setq *substr-lengths* nil)
- (setq *patterns* (make-hash-table :test #'equal))
- (with-open-file (fin file-path)
- (loop for pat = (read-line fin nil)
- while pat do
- (let* ((key (strip-digits pat))
- (len (length key)))
- (setf (gethash key *patterns*) pat)
- (when (< len -min-substr-length-) (setq -min-substr-length- len))
- (when (> len -max-substr-length-) (setq -max-substr-length- len)))))
- (dotimes (x (1+ -max-substr-length-) t)
- (when (>= x -min-substr-length-) (push x *substr-lengths*)))))
- (defun generate-prefixes (str start)
- (let ((len (- (length str) start))
- (prefixes nil))
- (dolist (x *substr-lengths* prefixes)
- (when (< x len) (push (subseq str start (+ start x)) prefixes)))))
- ;;returns an updated marks array, eventually the slots with odd numbers will be hyphen breaks
- ;;marks is updated from starting offset as the string is iterated through
- (defun scan-patterns (str marks start)
- (let ((prefixes (generate-prefixes str start)))
- (dolist (key prefixes marks)
- (let ((pattern (gethash key *patterns*))
- (pos start)
- (n nil))
- (when pattern
- (dotimes (x (length pattern) marks)
- (setq n (digit-char-p (char pattern x)))
- (if n
- (when (> n (aref marks pos)) (setf (aref marks pos) n))
- (incf pos))))))))
- (defun analyze-word (word)
- (let* ((len (length word))
- (-word- (concatenate 'string "." word "."))
- (-marks- (make-array (length -word-) :initial-element 0)))
- (dotimes (i (length -word-) (subseq -marks- 1 (+ 1 len)))
- (setq -marks- (scan-patterns -word- -marks- i)))))
- ;;--- a few helper functions and then hyphenate ---
- ;;this function is broken, but the magic snippet works as an inline statement...
- ;;let's make it a macro in case that helps then
- (defmacro join (&rest strings)
- `(format nil "~{~a~^-~}" ,@strings))
- ;;there should be a way to iterate a filtered marks array but we lisp newbies...
- (defun split-by-pos-list (str start pos-list)
- (let ((end (car pos-list)))
- (when start
- (cons (subseq str start end)
- (split-by-pos-list str end (cdr pos-list))))))
- (defun generate-pos-list (marks-array)
- (let ((ls nil)
- (len (length marks-array)))
- (dotimes (x len)
- (let ((pos (- len x 1)))
- (when (oddp (aref marks-array pos))
- (push pos ls))))
- ls))
- (defun hyphenate (word)
- (let* ((marks (analyze-word word))
- (pos-list (generate-pos-list marks))
- (syllables (split-by-pos-list word 0 pos-list)))
- ;;without this words like "programmer" are okay but "capable" isn't...
- ;;probably a bug elsewhere!
- (remove-if #'(lambda (x) (equal x "")) syllables)))
- ;;;------------ MAIN ------------
- (in-package :user)
- (defconstant +how-show+ 'stdout)
- (defconstant +start-time+ (get-internal-run-time))
- ;;takes between 0.05 to 0.09 seconds
- ;;clisp runs much faster than ecl but the file load is comparable
- (Liang:load-hyphen-patterns #P"/sdcard/tex-hyphenation-patterns.txt")
- ;;takes about ??? seconds
- ;;clisp: approximately 115 seconds 'stdout
- ;; approximately 103 seconds 'dev/null
- ;;ecl: approximately 40 seconds 'stdout <-- wall time over 4 minutes, the internal clock gives "runtime" not io bound :/
- ;;ecl: approximately 38 seconds 'dev/null
- ;;
- ;;ecl mobile (on local device, no network larency):
- ;; with 'stdout
- ;; All that in 166.44 seconds!
- ;; real 5m45.929s
- ;; user 2m47.250s
- ;; sys 2m33.160s
- ;; with 'dev/null
- ;; All that in 132.17 seconds!
- ;; real 4m11.379s
- ;; user 2m13.020s
- ;; sys 1m58.120s
- (unless (eq +how-show+ 'repl)
- (with-open-file (word-list #P"/sdcard/enable1.txt")
- (loop for word = (read-line word-list nil)
- while word do
- (case +how-show+
- ;#'Liang:join isn't working correctly... we just print the syllable list
- ('stdout (write-line (Liang:join (Liang:hyphenate word))))
- ('dev/null (Liang:hyphenate word))
- (otherwise (error "set +how-out+ to either 'stdout or 'dev/null")))))
- ) ;end non-repl
- (defconstant +finish-time+ (get-internal-run-time))
- (defconstant +diff-time+ (- +finish-time+ +start-time+))
- (defconstant +seconds+ (/ +diff-time+ internal-time-units-per-second))
- (format t "All that in ~f seconds!~%" +seconds+)
Add Comment
Please, Sign In to add comment