Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | updated irregex |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
a8011c42a0fd1c40ecd224f8f8807f16 |
User & Date: | aldo 2024-09-27 18:19:17 |
Context
2024-09-27
| ||
18:29 | merge 7164164ed6 check-in: 6834d66b60 user: aldo tags: trunk | |
18:19 | updated irregex check-in: a8011c42a0 user: aldo tags: trunk | |
2019-04-11
| ||
18:15 | add sql field to sqlite3 statement check-in: 538cf8286e user: aldo tags: trunk | |
Changes
Changes to irregex.sls.
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 .. 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 ... 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 ... 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 ... 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 ... 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 ... 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 .... 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 .... 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 .... 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 .... 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 .... 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 .... 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 .... 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 .... 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 .... 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 .... 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 .... 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 .... 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 .... 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 .... 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 .... 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 .... 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 |
irregex-lengths irregex-names irregex-num-submatches irregex-extract irregex-split sre->cset) (import (except (rnrs) error find filter remove) (rnrs r5rs) (rnrs mutable-pairs) (rnrs mutable-strings) (only (chezscheme) include)) ;; definition from irregex (define (error msg . args) (display msg) (for-each (lambda (x) (display " ") (write x)) args) (newline) (0)) ;;;; irregex.scm -- IrRegular Expressions ;; ;; Copyright (c) 2005-2015 Alex Shinn. All rights reserved. ;; BSD-style license: http://synthcode.com/license.txt ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; At this moment there was a loud ring at the bell, and I could ;; hear Mrs. Hudson, our landlady, raising her voice in a wail of ;; expostulation and dismay. ;; ................................................................................ ;; "No, it's not quite so bad as that. It is the unofficial ;; force, -- the Baker Street irregulars." ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Notes ;; ;; This code should not require any porting - it should work out of ;; the box in any R[45]RS Scheme implementation. Slight modifications ;; are needed for R6RS (a separate R6RS-compatible version is included ;; in the distribution as irregex-r6rs.scm). ;; ;; The goal of portability makes this code a little clumsy and ;; inefficient. Future versions will include both cleanup and ;; performance tuning, but you can only go so far while staying ;; portable. AND-LET*, SRFI-9 records and custom macros would've been ;; nice. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; History ;; 0.9.4: 2015/12/14 - performance improvement for {n,m} matches ;; 0.9.3: 2014/07/01 - R7RS library ;; 0.9.2: 2012/11/29 - fixed a bug in -fold on conditional bos patterns ;; 0.9.1: 2012/11/27 - various accumulated bugfixes ;; 0.9.0: 2012/06/03 - Using tags for match extraction from Peter Bex. ;; 0.8.3: 2011/12/18 - various accumulated bugfixes ;; 0.8.2: 2010/08/28 - (...)? submatch extraction fix and alternate ................................................................................ ;; For look-behind searches, wrap an existing chunker such that it ;; returns the same results but ends at a given point. (define (wrap-end-chunker cnk src i) (make-irregex-chunker (lambda (x) (and (not (eq? x src)) ((chunker-get-next cnk) x))) (chunker-get-str cnk) (chunker-get-start cnk) (lambda (x) (if (eq? x src) i ((chunker-get-end cnk) x))) (chunker-get-substring cnk) (chunker-get-subchunk cnk))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; String Utilities ;; Unicode version (skip surrogates) ................................................................................ `(/ ,(integer->char 0) ,(integer->char #xD7FF) ,(integer->char #xE000) ,(integer->char #x10FFFF))) ;; ASCII version, offset to not assume 0-255 ;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223)))) ;; set to #f to ignore even an explicit request for utf8 handling (define *allow-utf8-mode?* #t) ;; (define *named-char-properties* '()) (define (string-scan-char str c . o) (let ((end (string-length str))) (let scan ((i (if (pair? o) (car o) 0))) (cond ((= i end) #f) ................................................................................ (j (if at? (+ i 2) (+ i 1)))) (read j (lambda (sexp j) (k (list u sexp) j))))) ((#\") (let scan ((from (+ i 1)) (i (+ i 1)) (res '())) (define (collect) (if (= from i) res (cons (substring str from i) res))) (if (>= i end) (error "unterminated string in embeded SRE" str) (case (string-ref str i) ((#\") (k (string-cat-reverse (collect)) (+ i 1))) ((#\\) (scan (+ i 1) (+ i 2) (collect))) (else (scan from (+ i 1) res)))))) ((#\#) (case (string-ref str (+ i 1)) ((#\;) ................................................................................ (error "unterminated (*'...) SRE escape" str) (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)))) (error "bad regexp syntax: (*FOO) not supported" str))) (else ;; (?...) case (case (string-ref str (+ i 2)) ((#\#) (let ((j (string-scan-char str #\) (+ i 3)))) (lp (+ j i) (+ j 1) flags (collect) st))) ((#\:) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save))) ((#\=) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '(look-ahead) (save))) ((#\!) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) ................................................................................ ;;((#\R) ; recursion ;; ) ((#\() (cond ((>= (+ i 3) end) (error "unterminated parenthesis in regexp" str)) ((char-numeric? (string-ref str (+ i 3))) (let* ((j (string-scan-char str #\) (+ i 3))) (n (string->number (substring str (+ i 3) j)))) (if (not n) (error "invalid conditional reference" str) (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) `(,n if) (save))))) ((char-alphabetic? (string-ref str (+ i 3))) (let* ((j (string-scan-char str #\) (+ i 3))) (s (string->symbol (substring str (+ i 3) j)))) (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) `(,s if) (save)))) (else (lp (+ i 2) (+ i 2) (flag-clear flags ~save?) '(if) (save))))) ((#\{) (error "unsupported Perl-style cluster" str)) (else (let ((old-flags flags)) ................................................................................ (let ((res (collect/single))) (cond ((null? res) (error "{ can't follow empty pattern")) (else (let* ((x (car res)) (tail (cdr res)) (j (string-scan-char str #\} (+ i 1))) (s2 (string-split-char (substring str (+ i 1) j) #\,)) (n (string->number (car s2))) (m (and (pair? (cdr s2)) (string->number (cadr s2))))) (cond ((or (not n) ................................................................................ (let ((lo-ls (char->utf8-list lo)) (hi-ls (char->utf8-list hi))) (if (not (= (length lo-ls) (length hi-ls))) (sre-alternate (list (unicode-range-climb-digits lo-ls hi-ls) (unicode-range-up-to hi-ls))) (let lp ((lo-ls lo-ls) (hi-ls hi-ls)) (cond ((null? lo-ls) '()) ((= (car lo-ls) (car hi-ls)) (sre-sequence (list (integer->char (car lo-ls)) (lp (cdr lo-ls) (cdr hi-ls))))) ((= (+ (car lo-ls) 1) (car hi-ls)) (sre-alternate (list (unicode-range-up-from lo-ls) (unicode-range-up-to hi-ls)))) (else (sre-alternate (list (unicode-range-up-from lo-ls) (unicode-range-middle lo-ls hi-ls) (unicode-range-up-to hi-ls))))))))) ................................................................................ ((any) '(* any)) ((nonl) '(* nonl)) (else (cons '* (map rec (cdr sre)))))) (else (cons (car sre) (map rec (cdr sre)))))) (else (case sre ((any) 'utf8-any) ((nonl) 'utf8-nonl) (else (if (and utf8? (char? sre) (high-char? sre)) (sre-sequence (map integer->char (char->utf8-list sre))) sre))))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Compilation ................................................................................ (and (pair? sre) (or (memq (car sre) '(* +)) (and (memq (car sre) '($ submatch => submatch-named seq :)) (pair? (cdr sre)) (null? (cddr sre)) (sre-repeater? (cadr sre)))))) (define (sre-searcher? sre) (if (pair? sre) (case (car sre) ((* +) (sre-any? (sre-sequence (cdr sre)))) ((seq : $ submatch => submatch-named) (and (pair? (cdr sre)) (sre-searcher? (cadr sre)))) ((or) (every sre-searcher? (cdr sre))) (else #f)) (eq? 'bos sre))) (define (sre-consumer? sre) (if (pair? sre) (case (car sre) ((* +) (sre-any? (sre-sequence (cdr sre)))) ((seq : $ submatch => submatch-named) (and (pair? (cdr sre)) (sre-consumer? (last sre)))) ((or) (every sre-consumer? (cdr sre))) (else #f)) (eq? 'eos sre))) (define (sre-has-submatches? sre) (and (pair? sre) (or (memq (car sre) '($ submatch => submatch-named)) (if (eq? 'posix-string (car sre)) (sre-has-submatches? (string->sre (cadr sre))) (any sre-has-submatches? (cdr sre)))))) ................................................................................ (cond ((and finalizer (not new-finalizer)) (finalize! finalizer memory matches))) (for-each (lambda (s) (let ((slot (vector-ref memory (cdr s))) (chunk&position (cons src (+ i 1)))) (vector-set! slot (car s) chunk&position))) (cdr cmds)) (for-each (lambda (c) (let* ((tag (vector-ref c 0)) (ss (vector-ref memory (vector-ref c 1))) (ds (vector-ref memory (vector-ref c 2)))) (vector-set! ds tag (vector-ref ss tag)))) (car cmds))))) (if new-finalizer (lp2 (+ i 1) next src (+ i 1) new-finalizer) (lp2 (+ i 1) next res-src res-index #f)))) (res-src (cond (index (irregex-match-end-chunk-set! matches index res-src) ................................................................................ (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\. #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\})) (punct . punctuation) (graphic . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~)) (graph . graphic) (blank . (or #\space ,(integer->char (- (char->integer #\space) 23)))) (whitespace . (or blank #\newline)) (space . whitespace) (white . whitespace) (printing or graphic whitespace) (print . printing) ;; XXXX we assume a (possibly shifted) ASCII-based ordering (control . (/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (- (char->integer #\space) 1)))) (cntrl . control) (hex-digit . (or numeric (/ #\a #\f #\A #\F))) ................................................................................ (ipv6-address . (seq (** 0 4 hex-digit) (** 1 7 #\: (? #\:) (** 0 4 hex-digit)))) (ip-address . (or ipv4-address ipv6-address)) (domain-atom . (+ (or alphanumeric #\_ #\-))) (domain . (seq domain-atom (+ #\. domain-atom))) ;; XXXX now anything can be a top-level domain, but this is still handy (top-level-domain . (w/nocase (or "arpa" "com" "gov" "mil" "net" "org" "aero" "biz" "coop" "info" "museum" "name" "pro" (= 2 alpha)))) (domain/common . (seq (+ domain-atom #\.) top-level-domain)) ;;(email-local-part . (seq (+ (or (~ #\") string)))) (email-local-part . (+ (or alphanumeric #\_ #\- #\. #\+))) (email . (seq email-local-part #\@ domain)) (url-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\. #\, #\& #\; (seq "%" hex-digit hex-digit))) (url-final-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\& (seq "%" hex-digit hex-digit))) (http-url . (w/nocase "http" (? "s") "://" (or domain/common ipv4-address) ;; (seq "[" ipv6-address "]") (? ":" (+ numeric)) ;; port ;; path (? "/" (* (or url-char "/")) (? "?" (* url-char)) ;; query (? "#" (? (* url-char) url-final-char)) ;; fragment ))) ................................................................................ next)) (a (and b (lp (list (cadar ls)) (new-state-number (max b next)) flags next)))) (and a (let ((c (add-state! (new-state-number a) '()))) (nfa-add-epsilon! buf c a #f) (nfa-add-epsilon! buf c b #f) c))))))) ((?) (let ((next (lp (cdr ls) n flags next))) ;; insert an epsilon transition directly to next ................................................................................ ;; closure, but has different tag value mappings. ;; If found, calculate reordering commands so we can map the closure ;; to that state instead of adding a new DFA state. ;; This is completely handwaved away in Laurikari's paper (it basically ;; says "insert reordering algorithm here"), so this code was constructed ;; after some experimentation. In other words, bugs be here. (define (find-reorder-commands-internal nfa closure dfa-states) (let ((num-states (nfa-num-states nfa)) (num-tags (nfa-num-tags nfa)) (closure-summary (mst-mappings-summary closure))) (let lp ((dfa-states dfa-states)) (if (null? dfa-states) #f (if (not (mst-same-states? (caar dfa-states) closure)) (lp (cdr dfa-states)) (let lp2 ((state-summary (mst-mappings-summary (caar dfa-states))) ................................................................................ (next cnk init src str i end matches (lambda () (body cnk init src str i end matches fail)))))) ((*) (cond ((sre-empty? (sre-sequence (cdr sre))) (error "invalid sre: empty *" sre)) (else (letrec ((body (lp (sre-sequence (cdr sre)) n flags (lambda (cnk init src str i end matches fail) (body cnk init src str i end matches (lambda () (next cnk init src str i end matches fail) )))))) (lambda (cnk init src str i end matches fail) (body cnk init src str i end matches (lambda () (next cnk init src str i end matches fail)))))))) ((*?) (cond ((sre-empty? (sre-sequence (cdr sre))) ................................................................................ (body cnk init src str i end matches fail) )))))) (lambda (cnk init src str i end matches fail) (next cnk init src str i end matches (lambda () (body cnk init src str i end matches fail)))))))) ((+) (lp (sre-sequence (cdr sre)) n flags (rec (list '* (sre-sequence (cdr sre)))))) ((=) (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre)))) ((>=) (rec `(** ,(cadr sre) #f ,@(cddr sre)))) ((**) (cond ((or (and (number? (cadr sre)) ................................................................................ ((bos) (lambda (cnk init src str i end matches fail) (if (and (eq? src (car init)) (eqv? i (cdr init))) (next cnk init src str i end matches fail) (fail)))) ((bol) (lambda (cnk init src str i end matches fail) (if (or (and (eq? src (car init)) (eqv? i (cdr init))) (and (> i ((chunker-get-start cnk) src)) (eqv? #\newline (string-ref str (- i 1))))) (next cnk init src str i end matches fail) (fail)))) ((bow) (lambda (cnk init src str i end matches fail) (if (and (or (if (> i ((chunker-get-start cnk) src)) (not (char-alphanumeric? (string-ref str (- i 1)))) (let ((ch (chunker-prev-char cnk src end))) (and ch (not (char-alphanumeric? ch))))) (and (eq? src (car init)) (eqv? i (cdr init)))) (if (< i end) (char-alphanumeric? (string-ref str i)) (let ((next ((chunker-get-next cnk) src))) (and next (char-alphanumeric? (string-ref ((chunker-get-str cnk) next) ((chunker-get-start cnk) next))))))) ................................................................................ (init-src (list str start end)) (init (cons init-src start))) (if (not (and (integer? start) (exact? start))) (error "irregex-fold: not an exact integer" start)) (if (not (and (integer? end) (exact? end))) (error "irregex-fold: not an exact integer" end)) (irregex-match-chunker-set! matches irregex-basic-string-chunker) (let lp ((src init-src) (i start) (acc knil)) (if (>= i end) (finish i acc) (let ((m (irregex-search/matches irx irregex-basic-string-chunker init src i matches))) (if (not m) (finish i acc) (let ((j (%irregex-match-end-index m 0))) (if (= j i) ;; skip one char forward if we match the empty string (lp (list str (+ j 1) end) (+ j 1) acc) (let ((acc (kons i m acc))) (irregex-reset-matches! matches) ;; no need to continue looping if this is a ;; searcher - it's already consumed the only ;; available match (if (flag-set? (irregex-flags irx) ~searcher?) (finish j acc) (lp (list str j end) j acc))))))))))) (define (irregex-fold irx kons . args) (if (not (procedure? kons)) (error "irregex-fold: not a procedure" kons)) (let ((kons2 (lambda (i m acc) (kons i (irregex-copy-matches m) acc)))) (apply irregex-fold/fast irx kons2 args))) (define (irregex-fold/chunked/fast irx kons knil cnk start . o) ................................................................................ (if (and (eq? end-src start) (= end-index i)) (if (>= end-index ((chunker-get-end cnk) end-src )) (let ((next ((chunker-get-next cnk) end-src))) (lp next ((chunker-get-start cnk) next) acc)) (lp end-src (+ end-index 1) acc)) (let ((acc (kons start i m acc))) (irregex-reset-matches! matches) ;; no need to continue looping if this is a ;; searcher - it's already consumed the only ;; available match (if (flag-set? (irregex-flags irx) ~searcher?) (finish end-src end-index acc) (lp end-src end-index acc))))))))))) (define (irregex-fold/chunked irx kons . args) (if (not (procedure? kons)) (error "irregex-fold/chunked: not a procedure" kons)) (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc)))) (apply irregex-fold/chunked/fast irx kons2 args))) ................................................................................ (define (irregex-replace/all irx str . o) (if (not (string? str)) (error "irregex-replace/all: not a string" str)) (irregex-fold/fast irx (lambda (i m acc) (let ((m-start (%irregex-match-start-index m 0))) (append (irregex-apply-match m o) (if (>= i m-start) acc (cons (substring str i m-start) acc))))) '() str (lambda (i acc) (let ((end (string-length str))) (string-cat-reverse (if (>= i end) acc ................................................................................ (define (irregex-split irx str . o) (if (not (string? str)) (error "irregex-split: not a string" str)) (let ((start (if (pair? o) (car o) 0)) (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str)))) (irregex-fold/fast irx (lambda (i m a) (if (= i (%irregex-match-start-index m 0)) a (cons (substring str i (%irregex-match-start-index m 0)) a))) '() str (lambda (i a) (reverse (if (= i end) a (cons (substring str i end) a)))) start end))) ) |
> | | | > > > > > > > > > > > > > > > > > > > > > | > > > > | > | > > | | > > | | | | | | > > | | | | > < < | > | | | | < | | > > > > > > > > > > > > > | | | | | | | < > > > > > | | | | | > | > | | | | | | < | < < | < < < < < < < > > > > > > | | | < > > > > > > < | | > > | | | | < | | | | | | | | | | | | | | | < < < | < | > | > | | > | | > > > > > > > > > > > > |
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 .. 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 ... 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 ... 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 ... 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 ... 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 ... 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 .... 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 .... 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 .... 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 .... 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 .... 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 .... 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 .... 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 .... 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 .... 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 .... 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 .... 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 .... 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 .... 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 .... 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 .... 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 .... 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 |
irregex-lengths irregex-names irregex-num-submatches irregex-extract irregex-split sre->cset) (import (srfi s0 cond-expand) (except (rnrs) error find filter remove) (rnrs r5rs) (rnrs mutable-pairs) (rnrs mutable-strings) (only (chezscheme) include)) ;; definition from irregex (define (error msg . args) (display msg) (for-each (lambda (x) (display " ") (write x)) args) (newline) (0)) ;;;; irregex.scm -- IrRegular Expressions ;; ;; Copyright (c) 2005-2024 Alex Shinn. All rights reserved. ;; BSD-style license: http://synthcode.com/license.txt ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; At this moment there was a loud ring at the bell, and I could ;; hear Mrs. Hudson, our landlady, raising her voice in a wail of ;; expostulation and dismay. ;; ................................................................................ ;; "No, it's not quite so bad as that. It is the unofficial ;; force, -- the Baker Street irregulars." ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Notes ;; ;; This code should not require any porting - it should work out of ;; the box in any R[457]RS Scheme implementation. Slight modifications ;; are needed for R6RS (a separate R6RS-compatible version is included ;; in the distribution as irregex-r6rs.scm). ;; ;; The goal of portability makes this code a little clumsy and ;; inefficient. Future versions will include both cleanup and ;; performance tuning, but you can only go so far while staying ;; portable. AND-LET*, SRFI-9 records and custom macros would've been ;; nice. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; History ;; 0.9.11: 2024/02/23 - Guile test and packaging support from Tomas Volf. ;; 0.9.10: 2021/07/06 - fixes for submatches under kleene star, empty seqs ;; in alternations, and bol in folds for backtracking ;; matcher (thanks John Clements and snan for reporting ;; and Peter Bex for fixing) ;; 0.9.9: 2021/05/14 - more comprehensive fix for repeated empty matches ;; 0.9.8: 2020/07/13 - fix irregex-replace/all with look-behind patterns ;; 0.9.7: 2019/12/31 - more intuitive handling of empty matches in -fold, ;; -replace and -split ;; 0.9.6: 2016/12/05 - fixed exponential memory use of + in compilation ;; of backtracking matcher (CVE-2016-9954). ;; 0.9.5: 2016/09/10 - fixed a bug in irregex-fold handling of bow ;; 0.9.4: 2015/12/14 - performance improvement for {n,m} matches ;; 0.9.3: 2014/07/01 - R7RS library ;; 0.9.2: 2012/11/29 - fixed a bug in -fold on conditional bos patterns ;; 0.9.1: 2012/11/27 - various accumulated bugfixes ;; 0.9.0: 2012/06/03 - Using tags for match extraction from Peter Bex. ;; 0.8.3: 2011/12/18 - various accumulated bugfixes ;; 0.8.2: 2010/08/28 - (...)? submatch extraction fix and alternate ................................................................................ ;; For look-behind searches, wrap an existing chunker such that it ;; returns the same results but ends at a given point. (define (wrap-end-chunker cnk src i) (make-irregex-chunker (lambda (x) (and (not (eq? x src)) ((chunker-get-next cnk) x))) (chunker-get-str cnk) (chunker-get-start cnk) (lambda (x) ;; TODO: this is a hack workaround for the fact that we don't ;; have either a notion of chunk equivalence or chunk truncation, ;; until which time (neg-)look-behind in a fold won't work on ;; non-basic chunks. (if (or (eq? x src) (and (not ((chunker-get-next cnk) x)) (not ((chunker-get-next cnk) src)))) i ((chunker-get-end cnk) x))) (chunker-get-substring cnk) (chunker-get-subchunk cnk))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; String Utilities ;; Unicode version (skip surrogates) ................................................................................ `(/ ,(integer->char 0) ,(integer->char #xD7FF) ,(integer->char #xE000) ,(integer->char #x10FFFF))) ;; ASCII version, offset to not assume 0-255 ;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223)))) ;; set to #f to ignore even an explicit request for utf8 handling ;; The utf8-mode is undesired on any implementation with native unicode support. ;; It is a workaround for those that treat strings as a raw byte sequences, and ;; does not work well otherwise. So disable it on implementations known to ;; handle unicode natively. (define *allow-utf8-mode?* (cond-expand ((and chicken (not full-unicode)) #t) (else #f))) ;; (define *named-char-properties* '()) (define (string-scan-char str c . o) (let ((end (string-length str))) (let scan ((i (if (pair? o) (car o) 0))) (cond ((= i end) #f) ................................................................................ (j (if at? (+ i 2) (+ i 1)))) (read j (lambda (sexp j) (k (list u sexp) j))))) ((#\") (let scan ((from (+ i 1)) (i (+ i 1)) (res '())) (define (collect) (if (= from i) res (cons (substring str from i) res))) (if (>= i end) (error "unterminated string in embedded SRE" str) (case (string-ref str i) ((#\") (k (string-cat-reverse (collect)) (+ i 1))) ((#\\) (scan (+ i 1) (+ i 2) (collect))) (else (scan from (+ i 1) res)))))) ((#\#) (case (string-ref str (+ i 1)) ((#\;) ................................................................................ (error "unterminated (*'...) SRE escape" str) (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)))) (error "bad regexp syntax: (*FOO) not supported" str))) (else ;; (?...) case (case (string-ref str (+ i 2)) ((#\#) (let ((j (string-scan-char str #\) (+ i 3)))) (if (not j) (error "missing ) after #" str) (lp (+ j i) (+ j 1) flags (collect) st)))) ((#\:) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save))) ((#\=) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '(look-ahead) (save))) ((#\!) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) ................................................................................ ;;((#\R) ; recursion ;; ) ((#\() (cond ((>= (+ i 3) end) (error "unterminated parenthesis in regexp" str)) ((char-numeric? (string-ref str (+ i 3))) (let ((j (string-scan-char str #\) (+ i 3)))) (if (not j) (error "missing closing parenthesis" str i) (let ((n (string->number (substring str (+ i 3) j)))) (if (not n) (error "invalid conditional reference" str) (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) `(,n if) (save))))))) ((char-alphabetic? (string-ref str (+ i 3))) (let ((j (string-scan-char str #\) (+ i 3)))) (if (not j) (error "missing closing parenthesis" str i) (let ((s (string->symbol (substring str (+ i 3) j)))) (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) `(,s if) (save)))))) (else (lp (+ i 2) (+ i 2) (flag-clear flags ~save?) '(if) (save))))) ((#\{) (error "unsupported Perl-style cluster" str)) (else (let ((old-flags flags)) ................................................................................ (let ((res (collect/single))) (cond ((null? res) (error "{ can't follow empty pattern")) (else (let* ((x (car res)) (tail (cdr res)) (j (or (string-scan-char str #\} (+ i 1)) (error "missing closing }" str i))) (s2 (string-split-char (substring str (+ i 1) j) #\,)) (n (string->number (car s2))) (m (and (pair? (cdr s2)) (string->number (cadr s2))))) (cond ((or (not n) ................................................................................ (let ((lo-ls (char->utf8-list lo)) (hi-ls (char->utf8-list hi))) (if (not (= (length lo-ls) (length hi-ls))) (sre-alternate (list (unicode-range-climb-digits lo-ls hi-ls) (unicode-range-up-to hi-ls))) (let lp ((lo-ls lo-ls) (hi-ls hi-ls)) (cond ((= (car lo-ls) (car hi-ls)) (sre-sequence (cons (integer->char (car lo-ls)) (if (null? (cdr lo-ls)) '() (cons (lp (cdr lo-ls) (cdr hi-ls)) '()))))) ((= (+ (car lo-ls) 1) (car hi-ls)) (sre-alternate (list (unicode-range-up-from lo-ls) (unicode-range-up-to hi-ls)))) (else (sre-alternate (list (unicode-range-up-from lo-ls) (unicode-range-middle lo-ls hi-ls) (unicode-range-up-to hi-ls))))))))) ................................................................................ ((any) '(* any)) ((nonl) '(* nonl)) (else (cons '* (map rec (cdr sre)))))) (else (cons (car sre) (map rec (cdr sre)))))) (else (case sre ((any) (if utf8? 'utf8-any 'any)) ((nonl) (if utf8? 'utf8-nonl 'nonl)) (else (if (and utf8? (char? sre) (high-char? sre)) (sre-sequence (map integer->char (char->utf8-list sre))) sre))))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Compilation ................................................................................ (and (pair? sre) (or (memq (car sre) '(* +)) (and (memq (car sre) '($ submatch => submatch-named seq :)) (pair? (cdr sre)) (null? (cddr sre)) (sre-repeater? (cadr sre)))))) (define (sre-bos? sre) (if (pair? sre) (case (car sre) ((seq : $ submatch => submatch-named) (and (pair? (cdr sre)) (sre-bos? (cadr sre)))) ((or) (every sre-bos? (cdr sre))) (else #f)) (eq? 'bos sre))) ;; a searcher doesn't need explicit iteration to find the first match (define (sre-searcher? sre) (or (sre-bos? sre) (and (pair? sre) (case (car sre) ((* +) (sre-any? (sre-sequence (cdr sre)))) ((seq : $ submatch => submatch-named) (and (pair? (cdr sre)) (sre-searcher? (cadr sre)))) ((or) (every sre-searcher? (cdr sre))) (else #f))))) ;; a consumer doesn't need to match more than once (define (sre-consumer? sre) (or (sre-bos? sre) (and (pair? sre) (case (car sre) ((* +) (sre-any? (sre-sequence (cdr sre)))) ((seq : $ submatch => submatch-named) (and (pair? (cdr sre)) (sre-consumer? (last sre)))) ((or) (every sre-consumer? (cdr sre))) (else #f))))) (define (sre-has-submatches? sre) (and (pair? sre) (or (memq (car sre) '($ submatch => submatch-named)) (if (eq? 'posix-string (car sre)) (sre-has-submatches? (string->sre (cadr sre))) (any sre-has-submatches? (cdr sre)))))) ................................................................................ (cond ((and finalizer (not new-finalizer)) (finalize! finalizer memory matches))) (for-each (lambda (s) (let ((slot (vector-ref memory (cdr s))) (chunk&position (cons src (+ i 1)))) (vector-set! slot (car s) chunk&position))) (cdr cmds)) ;; Reassigning commands may be in an order which ;; causes memory cells to be clobbered before ;; they're read out. Make 2 passes to maintain ;; old values by copying them into a closure. (for-each (lambda (execute!) (execute!)) (map (lambda (c) (let* ((tag (vector-ref c 0)) (ss (vector-ref memory (vector-ref c 1))) (ds (vector-ref memory (vector-ref c 2))) (value-from (vector-ref ss tag))) (lambda () (vector-set! ds tag value-from)))) (car cmds)))))) (if new-finalizer (lp2 (+ i 1) next src (+ i 1) new-finalizer) (lp2 (+ i 1) next res-src res-index #f)))) (res-src (cond (index (irregex-match-end-chunk-set! matches index res-src) ................................................................................ (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\. #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\})) (punct . punctuation) (graphic . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~)) (graph . graphic) (blank . (or #\space ,(integer->char (- (char->integer #\space) 23)))) ;; 0B - vertical tab, 0C - form feed (whitespace . (or blank #\newline #\x0C #\return #\x0B)) (space . whitespace) (white . whitespace) (printing . (or graphic whitespace)) (print . printing) ;; XXXX we assume a (possibly shifted) ASCII-based ordering (control . (/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (- (char->integer #\space) 1)))) (cntrl . control) (hex-digit . (or numeric (/ #\a #\f #\A #\F))) ................................................................................ (ipv6-address . (seq (** 0 4 hex-digit) (** 1 7 #\: (? #\:) (** 0 4 hex-digit)))) (ip-address . (or ipv4-address ipv6-address)) (domain-atom . (+ (or alphanumeric #\_ #\-))) (domain . (seq domain-atom (+ #\. domain-atom))) ;; XXXX now anything can be a top-level domain, but this is still handy (top-level-domain . (w/nocase (or "arpa" "com" "gov" "mil" "net" "org" "edu" "aero" "biz" "coop" "info" "museum" "name" "pro" (= 2 alpha)))) (domain/common . (seq (+ domain-atom #\.) top-level-domain)) ;;(email-local-part . (seq (+ (or (~ #\") string)))) (email-local-part . (+ (or alphanumeric #\_ #\- #\. #\+))) (email . (seq email-local-part #\@ domain)) (url-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\. #\, #\& #\; (seq "%" hex-digit hex-digit))) (url-final-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\& (seq "%" hex-digit hex-digit))) (http-url . (w/nocase "http" (? "s") "://" (or domain ipv4-address) ;; (seq "[" ipv6-address "]") (? ":" (+ numeric)) ;; port ;; path (? "/" (* (or url-char "/")) (? "?" (* url-char)) ;; query (? "#" (? (* url-char) url-final-char)) ;; fragment ))) ................................................................................ next)) (a (and b (lp (list (cadar ls)) (new-state-number (max b next)) flags next)))) (and a (let ((c (add-state! (new-state-number (max a b)) '()))) (nfa-add-epsilon! buf c a #f) (nfa-add-epsilon! buf c b #f) c))))))) ((?) (let ((next (lp (cdr ls) n flags next))) ;; insert an epsilon transition directly to next ................................................................................ ;; closure, but has different tag value mappings. ;; If found, calculate reordering commands so we can map the closure ;; to that state instead of adding a new DFA state. ;; This is completely handwaved away in Laurikari's paper (it basically ;; says "insert reordering algorithm here"), so this code was constructed ;; after some experimentation. In other words, bugs be here. (define (find-reorder-commands-internal nfa closure dfa-states) (let ((num-tags (nfa-num-tags nfa)) (closure-summary (mst-mappings-summary closure))) (let lp ((dfa-states dfa-states)) (if (null? dfa-states) #f (if (not (mst-same-states? (caar dfa-states) closure)) (lp (cdr dfa-states)) (let lp2 ((state-summary (mst-mappings-summary (caar dfa-states))) ................................................................................ (next cnk init src str i end matches (lambda () (body cnk init src str i end matches fail)))))) ((*) (cond ((sre-empty? (sre-sequence (cdr sre))) (error "invalid sre: empty *" sre)) (else (let ((body (rec (list '+ (sre-sequence (cdr sre)))))) (lambda (cnk init src str i end matches fail) (body cnk init src str i end matches (lambda () (next cnk init src str i end matches fail)))))))) ((*?) (cond ((sre-empty? (sre-sequence (cdr sre))) ................................................................................ (body cnk init src str i end matches fail) )))))) (lambda (cnk init src str i end matches fail) (next cnk init src str i end matches (lambda () (body cnk init src str i end matches fail)))))))) ((+) (cond ((sre-empty? (sre-sequence (cdr sre))) (error "invalid sre: empty +" sre)) (else (letrec ((body (lp (sre-sequence (cdr sre)) n flags (lambda (cnk init src str i end matches fail) (body cnk init src str i end matches (lambda () (next cnk init src str i end matches fail) )))))) body)))) ((=) (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre)))) ((>=) (rec `(** ,(cadr sre) #f ,@(cddr sre)))) ((**) (cond ((or (and (number? (cadr sre)) ................................................................................ ((bos) (lambda (cnk init src str i end matches fail) (if (and (eq? src (car init)) (eqv? i (cdr init))) (next cnk init src str i end matches fail) (fail)))) ((bol) (lambda (cnk init src str i end matches fail) (if (let ((ch (if (> i ((chunker-get-start cnk) src)) (string-ref str (- i 1)) (chunker-prev-char cnk init src)))) (or (not ch) (eqv? #\newline ch))) (next cnk init src str i end matches fail) (fail)))) ((bow) (lambda (cnk init src str i end matches fail) (if (and (if (> i ((chunker-get-start cnk) src)) (not (char-alphanumeric? (string-ref str (- i 1)))) (let ((ch (chunker-prev-char cnk init src))) (or (not ch) (not (char-alphanumeric? ch))))) (if (< i end) (char-alphanumeric? (string-ref str i)) (let ((next ((chunker-get-next cnk) src))) (and next (char-alphanumeric? (string-ref ((chunker-get-str cnk) next) ((chunker-get-start cnk) next))))))) ................................................................................ (init-src (list str start end)) (init (cons init-src start))) (if (not (and (integer? start) (exact? start))) (error "irregex-fold: not an exact integer" start)) (if (not (and (integer? end) (exact? end))) (error "irregex-fold: not an exact integer" end)) (irregex-match-chunker-set! matches irregex-basic-string-chunker) (let lp ((src init-src) (from start) (i start) (acc knil)) (if (>= i end) (finish from acc) (let ((m (irregex-search/matches irx irregex-basic-string-chunker init src i matches))) (if (not m) (finish from acc) (let ((j-start (%irregex-match-start-index m 0)) (j (%irregex-match-end-index m 0)) (acc (kons from m acc))) (irregex-reset-matches! matches) (cond ((flag-set? (irregex-flags irx) ~consumer?) (finish j acc)) ((= j j-start) ;; skip one char forward if we match the empty string (lp (list str j end) j (+ j 1) acc)) (else (lp (list str j end) j j acc)))))))))) (define (irregex-fold irx kons . args) (if (not (procedure? kons)) (error "irregex-fold: not a procedure" kons)) (let ((kons2 (lambda (i m acc) (kons i (irregex-copy-matches m) acc)))) (apply irregex-fold/fast irx kons2 args))) (define (irregex-fold/chunked/fast irx kons knil cnk start . o) ................................................................................ (if (and (eq? end-src start) (= end-index i)) (if (>= end-index ((chunker-get-end cnk) end-src )) (let ((next ((chunker-get-next cnk) end-src))) (lp next ((chunker-get-start cnk) next) acc)) (lp end-src (+ end-index 1) acc)) (let ((acc (kons start i m acc))) (irregex-reset-matches! matches) (if (flag-set? (irregex-flags irx) ~consumer?) (finish end-src end-index acc) (lp end-src end-index acc))))))))))) (define (irregex-fold/chunked irx kons . args) (if (not (procedure? kons)) (error "irregex-fold/chunked: not a procedure" kons)) (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc)))) (apply irregex-fold/chunked/fast irx kons2 args))) ................................................................................ (define (irregex-replace/all irx str . o) (if (not (string? str)) (error "irregex-replace/all: not a string" str)) (irregex-fold/fast irx (lambda (i m acc) (let ((m-start (%irregex-match-start-index m 0))) (if (>= i m-start) (append (irregex-apply-match m o) acc) (append (irregex-apply-match m o) (cons (substring str i m-start) acc))))) '() str (lambda (i acc) (let ((end (string-length str))) (string-cat-reverse (if (>= i end) acc ................................................................................ (define (irregex-split irx str . o) (if (not (string? str)) (error "irregex-split: not a string" str)) (let ((start (if (pair? o) (car o) 0)) (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str)))) (irregex-fold/fast irx (lambda (i m a) (cond ((= i (%irregex-match-start-index m 0)) a) (else (cons (substring str i (%irregex-match-start-index m 0)) a)))) '() str (lambda (i a) (let lp ((ls (if (= i end) a (cons (substring str i end) a))) (res '()) (was-char? #f)) (cond ((null? ls) res) ((char? (car ls)) (lp (cdr ls) (if (or was-char? (null? res)) (cons (string (car ls)) res) (cons (string-append (string (car ls)) (car res)) (cdr res))) #t)) (else (lp (cdr ls) (cons (car ls) res) #f))))) start end))) ) |