pcre.pl

View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker and Peter Ludemann
    4    E-mail:        jan@swi-prolog.org
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2017-2022, VU University Amsterdam
    7                              SWI-Prolog Solutions b.v.
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(pcre,
   37          [ re_match/2,           % +Regex, +String
   38            re_match/3,           % +Regex, +String, +Options
   39            re_matchsub/3,        % +Regex, +String, -Subs
   40            re_matchsub/4,        % +Regex, +String, -Subs, +Options
   41            re_foldl/6,           % :Goal, +Regex, +String, ?V0, ?V, +Options
   42            re_split/3,           % +Pattern, +String, -Split:list
   43            re_split/4,           % +Pattern, +String, -Split:list, +Options
   44            re_replace/4,         % +Pattern, +With, +String, -NewString
   45            re_replace/5,         % +Pattern, +With, +String, -NewString, +Options
   46            re_compile/3,         % +Pattern, -Regex, +Options
   47            re_flush/0,
   48            re_config/1           % ?Config
   49          ]).   50:- autoload(library(apply), [maplist/2, maplist/3]).   51:- autoload(library(error), [must_be/2, existence_error/2]).   52:- autoload(library(dcg/basics), [eos/2, digit/3, digits/3]).   53:- autoload(library(lists), [append/3]).   54
   55:- use_foreign_library(foreign(pcre4pl)).   56
   57:- meta_predicate
   58    re_foldl(3, +, +, ?, ?, +).   59
   60/** <module> Perl compatible regular expression matching for SWI-Prolog
   61
   62This module provides an interface  to  the [PCRE2](http://www.pcre.org/)
   63(Perl Compatible Regular Expression)  library.   This  Prolog  interface
   64provides an almost comprehensive wrapper around PCRE2  (the successor to
   65PCRE)  with as much backward compatibility to PCRE as possible,  because
   66the original implementation was for PCRE (also known as PCRE1).
   67
   68Regular  expressions  are  created  from   a  pattern  and  options  and
   69represented as  a SWI-Prolog _blob_.   This implies they are  subject to
   70(atom) garbage  collection. Compiled  regular expressions can  safely be
   71used  in multiple  threads. Most  predicates accept  both an  explicitly
   72compiled regular  expression, a pattern,  or a term  Pattern/Flags.  The
   73semantics of the pattern can be additionally modified by options. In the
   74latter two cases a regular expression  _blob_ is created and stored in a
   75cache. The cache can be cleared using re_flush/0.
   76
   77@see `man pcre2api` or https://www.pcre.org/current/doc/html/pcre2api.html
   78     for details of the PCRE2 syntax and options.
   79*/
   80
   81:- predicate_options(re_match/3, 3,
   82                     [ start(integer), % Not part of pcre2 API
   83                       % These are in the same order as in pcre4pl.c, to make it easy to compare them
   84                       anchored(boolean),    % Also re_compile/3
   85                       utf_check(boolean),   % Also re_compile/3
   86                       endanchored(boolean), % Also re_compile/3
   87                       bol(boolean),
   88                       eol(boolean),
   89                       empty(boolean),
   90                       empty_atstart(boolean),
   91                       partial_soft(bool),
   92                       partial_hard(bool),
   93                       % dfa_restart(bool),  % TODO: if pcre2_dfa_match() is supported
   94                       % dfa_shortest(bool), % TODO: if pcre2_dfa_match() is supported
   95                       jit(boolean),
   96                       copy_matched_subject(boolean)
   97                     ]).   98:- predicate_options(re_compile/3, 3,
   99                     [ capture_type(oneof([atom,string,range])), % Not part of pcre2 API
  100                       % These are in the same order as in pcre4pl.c, to make it easy to compare them
  101                       anchored(boolean),    % Also re_match/3
  102                       utf_check(boolean),   % Also re_match/3
  103                       endanchored(boolean), % Also re_match/3
  104                       allow_empty_class(boolean),
  105                       alt_bsux(boolean),
  106                       auto_callout(boolean),
  107                       caseless(boolean),
  108                       dollar_endonly(boolean),
  109                       dotall(boolean),
  110                       dupnames(boolean),
  111                       extended(boolean),
  112                       firstline(boolean),
  113                       match_unset_backref(boolean),
  114                       multiline(boolean),
  115                       never_ucp(boolean),
  116                       never_utf(boolean),
  117                       auto_capture(boolean),
  118                       no_auto_capture(boolean), % backwards compatibility
  119                       auto_possess(boolean),
  120                       dotstar_anchor(boolean),
  121                       start_optimize(boolean),
  122                       ucp(boolean),
  123                       greedy(boolean),
  124                       ungreedy(boolean), % Backwards compatibility
  125                       utf(boolean),
  126                       never_backslash_c(boolean),
  127                       alt_circumflex(boolean),
  128                       alt_verbnames(boolean),
  129                       use_offset_limit(boolean),
  130                       extended_more(boolean),
  131                       literal(boolean),
  132                       match_invalid_utf(boolean),
  133                       jit_complete(boolean),
  134                       jit_partial_soft(boolean),
  135                       jit_partial_hard(boolean),
  136                       jit_invalid_utf(boolean),
  137                       bsr(oneof([anycrlf,unicode])),
  138                       bsr2(oneof([anycrlf,unicode])),
  139                       compat(oneof([])), % Obsolete
  140                       newline(oneof([any,anycrlf,cr,lf,crlf,nul])),
  141                       newline2(oneof([any,anycrlf,cr,lf,crlf,nul]))
  142                     ]).  143:- predicate_options(re_matchsub/4, 4,
  144                     [ pass_to(re_match/3, 3)
  145                     ]).  146:- predicate_options(re_foldl/6, 6,
  147                     [ pass_to(re_match/3, 3)
  148                     ]).  149:- predicate_options(re_split/4, 4,
  150                     [ pass_to(re_match/3, 3)
  151                     ]).  152:- predicate_options(re_replace/5, 5,
  153                     [ pass_to(re_match/3, 3)
  154                     ]).  155
  156%!  re_match(+Regex, +String) is semidet.
  157%!  re_match(+Regex, +String, +Options) is semidet.
  158%
  159%   Succeeds if String matches Regex.  For example:
  160%
  161%     ```
  162%     ?- re_match("^needle"/i, "Needle in a haystack").
  163%     true.
  164%     ```
  165%
  166%   Defined  Options  are  given  below.   For  details,  see  the  PCRE
  167%   documentation.  If  an option is  repeated, the first value  is used
  168%   and  subsequent  values  are   ignored.   Unrecognized  options  are
  169%   ignored.   Unless otherwise  specified, boolean  options default  to
  170%   `false`.
  171%
  172%   If Regex is a text pattern  (optionally with flags), then any of the
  173%   Options for  re_compile/3 can  be used, in  addition to  the Options
  174%   listed below. If Regex is the  result of re_compile/3, then only the
  175%   following execution-time  Options are recognized and  any others are
  176%   ignored. Some options may not exist on your system, depending on the
  177%   PCRE2 version and  how it was built - these  unsupported options are
  178%   silently ignored.
  179%
  180%     * start(From)
  181%     Start at the given character index
  182%     * anchored(Bool)
  183%     If `true`, match only at the first position
  184%     * bol(Bool)
  185%     String is the beginning of a line (default `true`) -
  186%       affects behavior of circumflex metacharacter (`^`).
  187%     * empty(Bool)
  188%     An empty string is a valid match (default `true`)
  189%     * empty_atstart(Bool)
  190%     An empty string at the start of the subject is a valid match
  191%     (default `true`)
  192%     * eol(Bool)
  193%     String is the end of a line -
  194%       affects behavior of dollar metacharacter (`$`)
  195%       (default `true`).
  196%     * newline(Mode)
  197%     If `any`, recognize any Unicode newline sequence,
  198%     if `anycrlf`, recognize CR, LF, and CRLF as newline
  199%     sequences, if `cr`, recognize CR, if `lf`, recognize
  200%     LF, if `crlf` recognize CRLF as newline.
  201%     The default is determined by how PCRE was built, and
  202%     can be found by re_config(newline2(NewlineDefault)).
  203%     * newline2(Mode) - synonym for newline(Mode).
  204%     * utf_check(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  205%     You should not need this because SWI-Prolog ensures that the UTF8 strings are valid,
  206%     so the default is `false`.
  207%     * endanchored(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  208%     * partial_soft(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  209%     * partial_hard(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  210%     * dfa_restart(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  211%     * dfa_shortest(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  212%
  213%   @arg  Regex is  the  output of  re_compile/3, a  pattern  or a  term
  214%   Pattern/Flags, where Pattern is an atom or string. The defined flags
  215%   and their related option for re_compile/3 are below.
  216%     - *x*: extended(true)
  217%     - *i*: caseless(true)
  218%     - *m*: multiline(true)
  219%     - *s*: dotall(true)
  220%     - *a*: capture_type(atom)
  221%     - *r*: capture_type(range)
  222%     - *t*: capture_type(term)
  223%
  224%   If Regex is the output  of re_compile/3, any compile-time options in
  225%   Options or Flags are ignored and only match-time options are used.
  226%
  227%   The options  that are  derived from flags  take precedence  over the
  228%   options in the  Options list. In the case of  conflicting flags, the
  229%   first one is used (e.g., `ra` results in `capture_type(range)`).
  230
  231re_match(Regex, String) :-
  232    re_match(Regex, String, []).
  233re_match(Regex, String, Options) :-
  234    re_compiled(Regex, Compiled, Options),
  235    re_match_(Compiled, String, Options).
  236
  237%!  re_matchsub(+Regex, +String, -Sub:dict) is semidet.
  238%!  re_matchsub(+Regex, +String, -Sub:dict, +Options) is semidet.
  239%
  240%   Match String  against Regex.  On  success, Sub is a  dict containing
  241%   integer keys  for the numbered capture  group and atom keys  for the
  242%   named capture groups. The entire match  string has the key `0`.  The
  243%   associated  value is  determined  by  the capture_type(Type)  option
  244%   passed  to  re_compile/3, or  by  flags  if  Regex  is of  the  form
  245%   Pattern/Flags;  and may  be  specified at  the  level of  individual
  246%   captures  using  a  naming  convention for  the  capture  name.  See
  247%   re_compile/3 for details.
  248%
  249%   The  example  below  exploits  the  typed groups  to  parse  a  date
  250%   specification:
  251%
  252%     ```
  253%     ?- re_matchsub("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) -
  254%                     (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/x,
  255%                    "2017-04-20", Sub, []).
  256%     Sub = re_match{0:"2017-04-20", date:"2017-04-20",
  257%                    day:20, month:4, year:2017}.
  258%
  259%     ```
  260%
  261%   @arg Options Both compilation and execution options are processed.
  262%   See re_compile/3 and re_match/3 for the set of options. In addition,
  263%   some compilation options may be passed as ``/Flags`` to Regex - see
  264%   re_match/3 for the list of flags.
  265%
  266%   @arg Regex  See re_match/2 for a description of this argument.
  267
  268re_matchsub(Regex, String, Subs) :-
  269    re_matchsub(Regex, String, Subs, []).
  270
  271re_matchsub(Regex, String, Subs, Options) :-
  272    re_compiled(Regex, Compiled, Options),
  273    re_matchsub_(Compiled, String, Pairs, Options),
  274    dict_pairs(Subs, re_match, Pairs).
  275
  276%!  re_foldl(:Goal, +Regex, +String, ?V0, ?V, +Options) is semidet.
  277%
  278%   Fold all matches of Regex on String.  Each match is represented by a
  279%   dict as specified  for re_matchsub/4.  V0 and V are  related using a
  280%   sequence of invocations of Goal as illustrated below.
  281%
  282%       ```
  283%       call(Goal, Dict1, V0, V1),
  284%       call(Goal, Dict2, V1, V2),
  285%       ...
  286%       call(Goal, Dictn, Vn, V).
  287%       ```
  288%
  289%   This predicate is used to implement re_split/4 and re_replace/4. For
  290%   example, we  can count all matches  of a Regex on  String using this
  291%   code:
  292%
  293%     ```
  294%     re_match_count(Regex, String, Count) :-
  295%         re_foldl(increment, Regex, String, 0, Count, []).
  296%
  297%     increment(_Match, V0, V1) :-
  298%         V1 is V0+1.
  299%     ```
  300%
  301%   After which we can query
  302%
  303%     ```
  304%     ?- re_match_count("a", "aap", X).
  305%     X = 2.
  306%     ```
  307%
  308%  Here is an example Goal for extracting all the matches with their
  309%  offsets within the string:
  310%
  311%  ```
  312%  range_match(Dict, StringIndex-[MatchStart-Substring|List], StringIndex-List) :-
  313%      Dict.(StringIndex.index) = MatchStart-MatchLen,
  314%      sub_string(StringIndex.string, MatchStart, MatchLen, _, Substring).
  315%  ```
  316%  And can be used with this query (note the capture_type(range) option,
  317%  which is needed by `range_match/3`, and greedy(false) to invert the
  318%  meaning of `*?`):
  319%  ```
  320%  ?- String = "{START} Mary {END} had a {START} little lamb {END}",
  321%     re_foldl(range_match,
  322%              "{START} *?(?<piece>.*) *?{END}",
  323%              String, _{string:String,index:piece}-Matches, _-[],
  324%              [capture_type(range),greedy(false)]).
  325%  Matches = [8-"Mary", 33-"little lamb"].
  326%  ```
  327re_foldl(Goal, Regex, String, V0, V, Options) :-
  328    re_compiled(Regex, Compiled, Options),
  329    re_foldl_(Compiled, String, Goal, V0, V, Options).
  330
  331:- public re_call_folder/4. % prevent code obfusication name mangling
  332:- meta_predicate re_call_folder(2, +, ?, ?).  333
  334%   re_call_folder(:Goal, +Pairs, ?V0, ?V1).
  335%   Used by re_foldl_/6 to call Goal with a dict.
  336%     DO NOT use "%!" comment - that would add it to the docs
  337re_call_folder(Goal, Pairs, V0, V1) :-
  338    dict_pairs(Dict, re_match, Pairs),
  339    call(Goal, Dict, V0, V1).
  340
  341
  342%!  re_split(+Pattern, +String, -Splits:list) is det.
  343%!  re_split(+Pattern, +String, -Splits:list, +Options) is det.
  344%
  345%   Split String using the regular  expression Pattern. Splits is a list
  346%   of strings holding alternating matches  of Pattern and skipped parts
  347%   of the String, starting with a  skipped part.  The Splits list  ends
  348%   with a  string of  the content  of String after  the last  match. If
  349%   Pattern does not  appear in String, Splits is a  list holding a copy
  350%   of String. This implies the number of elements in Splits is _always_
  351%   odd.  For example:
  352%
  353%     ```
  354%     ?- re_split("a+", "abaac", Splits, []).
  355%     Splits = ["","a","b","aa","c"].
  356%     ?- re_split(":\\s*"/n, "Age: 33", Splits, []).
  357%     Splits = ['Age', ': ', 33].
  358%     ```
  359%
  360%   @arg  Pattern is  the pattern  text, optionally  followed by  /Flags.
  361%   Similar to re_matchsub/4, the final output type can be controlled by
  362%   a flag `a` (atom), `s` (string, default) or `n` (number if possible,
  363%   atom otherwise).
  364
  365re_split(Pattern, String, Splits) :-
  366    re_split(Pattern, String, Splits, []).
  367re_split(Pattern, String, Splits, Options) :-
  368    split_range_regex(Pattern, Compiled, Type, Options),
  369    State = state(String, 0, Type),
  370    re_foldl(split(State), Compiled, String, Splits, [Last], Options),
  371    arg(2, State, LastSkipStart),
  372    typed_sub(Type, String, LastSkipStart, _, 0, Last).
  373
  374split_range_regex(Pattern/Flags, Compiled, Type, Options) =>
  375    split_range_regex(Pattern, Flags, Compiled, Type, Options).
  376split_range_regex(Pattern, Compiled, Type, Options) =>
  377    split_range_regex(Pattern, '', Compiled, Type, Options).
  378
  379split_range_regex(Pattern, Flags, Compiled, Type, Options) =>
  380    regex_capture_type_flag_chars(Flags, Chars, Options),
  381    split_flags(Chars, Chars1, Type),
  382    atom_chars(RFlags, [r|Chars1]),
  383    re_flags_options(RFlags, ROptions),
  384    append(ROptions, Options, Options2),
  385    re_compiled(Pattern/RFlags, Compiled, Options2).
  386
  387split_flags([], [], Type) :-
  388    default(Type, string).
  389split_flags([H|T0], T, Type) :-
  390    split_type(H, Type),
  391    !,
  392    split_flags(T0, T, Type).
  393split_flags([H|T0], [H|T], Type) :-
  394    split_flags(T0, T, Type).
  395
  396split_type(a, atom).
  397split_type(s, string).
  398split_type(n, name).
  399
  400split(State, Dict, [Skipped,Sep|T], T) :-
  401    matched(State, Dict.0, Sep),
  402    skipped(State, Dict.0, Skipped).
  403
  404matched(state(String, _, Type), Start-Len, Matched) :-
  405    typed_sub(Type, String, Start, Len, _, Matched).
  406
  407skipped(State, Start-Len, Skipped) :-
  408    State = state(String, Here, Type),
  409    SkipLen is Start-Here,
  410    typed_sub(Type, String, Here, SkipLen, _, Skipped),
  411    NextSkipStart is Start+Len,
  412    nb_setarg(2, State, NextSkipStart).
  413
  414typed_sub(string, Haystack, B, L, A, String) :-
  415    sub_string(Haystack, B, L, A, String).
  416typed_sub(atom, Haystack, B, L, A, String) :-
  417    sub_atom(Haystack, B, L, A, String).
  418typed_sub(name, Haystack, B, L, A, Value) :-
  419    sub_string(Haystack, B, L, A, String),
  420    (   number_string(Number, String)
  421    ->  Value = Number
  422    ;   atom_string(Value, String)
  423    ).
  424
  425%!  re_replace(+Pattern, +With, +String, -NewString) is det.
  426%!  re_replace(+Pattern, +With, +String, -NewString, +Options) is det.
  427%
  428%   Replace matches  of the  regular expression  Pattern in  String with
  429%   With (possibly containing references to captured substrings).
  430%
  431%   Throws  an error  if With  uses  a name  that doesn't  exist in  the
  432%   Pattern.
  433%
  434%   @arg Pattern  is the  pattern text,  optionally followed  by /Flags.
  435%   Flags  may include  `g`, replacing  all occurrences of Pattern.   In
  436%   addition, similar  to re_matchsub/4,  the final  output type  can be
  437%   controlled  by a  flag `a`  (atom)  or `s`  (string, default).   The
  438%   output  type can  also be  specified by  the `capture_type`  option.
  439%   Capture  type  suffixes  can   modify  behavior;  for  example,  the
  440%   following  will  change an  ISO  8601  format date  (YYYY-MM-DD)  to
  441%   American style (m/d/y),  and also remove leading zeros  by using the
  442%   `_I` suffix:
  443%
  444%   ```
  445%   re_replace("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) -
  446%               (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/x,
  447%              "$month-$day-$year",
  448%              ISODate, AmericanDate)
  449%   ```
  450%
  451%   @arg  With  is  the  replacement text.  It  may  reference  captured
  452%   substrings using \N or $Name. Both N  and Name may be written as {N}
  453%   and {Name} to avoid ambiguities. If  a substring is named, it cannot
  454%   be referenced by its number. The single characters `$` and `\` can be
  455%   escaped  by  doubling  (e.g.,  `re_replace(".","$$","abc",Replaced)`
  456%   results in  `Replaced="$bc"`). (Because  `\` is an  escape character
  457%   inside strings, you need to write "\\\\" to get a single backslash.)
  458%
  459%   @arg Options See re_match/3 for the set of options.
  460%
  461%   The options  that are  derived from flags  take precedence  over the
  462%   options in the  Options list. In the case of  conflicting flags, the
  463%   first one  is used  (e.g., `as` results  in `capture_type(string)`).
  464%   If  a  `capture_type` is  meaningless  (`range`  or `term`),  it  is
  465%   ignored.
  466
  467re_replace(Pattern, With, String, NewString) :-
  468    re_replace(Pattern, With, String, NewString, []).
  469
  470re_replace(Pattern, With, String, NewString, Options) :-
  471    replace_range_regex(Pattern, Compiled, All, Type, Options),
  472    compile_replacement(With, RCompiled),
  473    State = state(String, 0, Type),
  474    (   All == all
  475    ->  re_foldl(replace(State, RCompiled), Compiled, String, Parts, [Last], [])
  476    ;   (   re_matchsub(Compiled, String, Match, [])
  477        ->  replace(State, RCompiled, Match, Parts, [Last])
  478        ;   Repl = false
  479        )
  480    ),
  481    (   Repl == false
  482    ->  parts_to_output(Type, [String], NewString)
  483    ;   arg(2, State, LastSkipStart),
  484        sub_string(String, LastSkipStart, _, 0, Last),
  485        parts_to_output(Type, Parts, NewString)
  486    ).
  487
  488regex_capture_type_flag_chars(Flags, Chars, Options) :-
  489    atom_chars(Flags, Chars0),
  490    % For replace or split, the capture_type must be range, so if a
  491    % different result is desired, it is specified in the flags. The
  492    % following code converts an Options capture_type to a flag
  493    % character and appends it to the Flags.
  494    (   memberchk(capture_type(T), Options),
  495        type_flag(TFlag, T)
  496    ->  % No need to do delete(Options,capture_type(_),Options2)
  497        % because Flags take precedence and first occurence in Options
  498        % takes precedence.
  499        append(Chars0, [TFlag], Chars)
  500    ;   Chars = Chars0
  501    ).
  502
  503%! replace_range_regex(+Pattern, -Compiled, -All, -Type, +Options) is det.
  504replace_range_regex(Pattern/Flags, Compiled, All, Type, Options) =>
  505    replace_range_regex(Pattern, Flags, Compiled, All, Type, Options).
  506replace_range_regex(Pattern, Compiled, All, Type, Options) =>
  507    replace_range_regex(Pattern, '', Compiled, All, Type, Options).
  508
  509replace_range_regex(Pattern, Flags, Compiled, All, Type, Options) =>
  510    regex_capture_type_flag_chars(Flags, Chars, Options),
  511    replace_flags(Chars, Chars1, All, Type),
  512    atom_chars(RFlags, [r|Chars1]),
  513    re_flags_options(RFlags, ROptions),
  514    append(ROptions, Options, Options2),
  515    re_compiled(Pattern, Compiled, Options2).
  516
  517replace_flags([], [], All, Type) :-
  518    default(All, first),
  519    default(Type, string).
  520replace_flags([H|T0], T, All, Type) :-
  521    (   all_flag(H, All)
  522    ->  true
  523    ;   type_flag(H, Type)
  524    ),
  525    !,
  526    replace_flags(T0, T, All, Type).
  527replace_flags([H|T0], [H|T], All, Type) :-
  528    replace_flags(T0, T, All, Type).
  529
  530all_flag(g, all).
  531
  532type_flag(a, atom).
  533type_flag(s, string).
  534
  535%! default(?Val, +Default) is det.
  536%  If Val isn't instantiated, instantiate it to Default.
  537%  If Val is already instantiated, succeed.
  538%  Equivalent to:
  539%     default( Val,  Default), var(Val) => Val = Default.
  540%     default(_Val, _Default) => true.
  541default(Val, Val) :- !.
  542default(_, _).
  543
  544replace(State, With, Dict, [Skipped|Parts], T) :-
  545    State = state(String, _, _Type),
  546    copy_term(With, r(PartsR, Skel)),
  547    maplist(dict_pair_lookup(Dict), Skel),
  548    range_strings(PartsR, String, Parts, T),
  549    skipped(State, Dict.0, Skipped).
  550
  551% dict_pair_lookup(d{a:1}, a-K) results in K=1.
  552dict_pair_lookup(Dict, Key-Dict.Key).
  553
  554range_strings([], _, T, T).
  555range_strings([Start-Len|T0], String, [S|T1], T) :-
  556    !,
  557    sub_string(String, Start, Len, _, S),
  558    range_strings(T0, String, T1, T).
  559range_strings([S|T0], String, [S|T1], T) :-
  560    range_strings(T0, String, T1, T).
  561
  562parts_to_output(string, Parts, String) :-
  563    atomics_to_string(Parts, String).
  564parts_to_output(atom, Parts, String) :-
  565    atomic_list_concat(Parts, String).
  566
  567%!  compile_replacement(+With, -Compiled)
  568%
  569%   Compile the replacement specification  into a specification that can
  570%   be processed quickly. The compiled expressions are cached and may be
  571%   reclaimed using  re_flush/0 (which also removes  compiled Regex from
  572%   re_compile/3).
  573%
  574%   This "compilation" has nothing to  do with PCRE pattern compilation;
  575%   it's used by re_replace/5 to process the With argument.
  576
  577:- table compile_replacement/2 as shared.  578
  579compile_replacement(With, r(Parts, Extract)) :-
  580    string_codes(With, Codes),
  581    phrase(replacement_parts(Parts, Pairs), Codes),
  582    % Pairs is LookupKey-Slot pairs, where a LookupKey might be
  583    % duplicated (Slot is a shared variable within Parts).
  584    Extract = Pairs.
  585
  586replacement_parts(Parts, Extract) -->
  587    string_escape(HCodes),
  588    (   ("\\" ; "$"),
  589        capture_name(Name)
  590    ->  !,
  591        { add_part(HCodes, Parts, T0),
  592          T0 = [Repl|T1],
  593          Extract = [Name-Repl|Extract1]
  594        },
  595        replacement_parts(T1, Extract1)
  596    ;   eos
  597    ->  !,
  598        { add_part(HCodes, Parts, []),
  599          Extract = []
  600        }
  601    ).
  602
  603add_part([], Parts, Parts) :-
  604    !.
  605add_part(Codes, [H|T], T) :-
  606    string_codes(H, Codes).
  607
  608%! string_escape(-Codes)// is nondet.
  609% Similar to dcg_basics:string(Codes) but also escapes "$" and "/"
  610string_escape([]) -->
  611    [].
  612string_escape([0'$|T]) -->
  613    "$$", !,
  614    string_escape(T).
  615string_escape([0'\\|T]) -->
  616    "\\\\", !,
  617    string_escape(T).
  618string_escape([H|T]) -->
  619    [H],
  620    string_escape(T).
  621
  622capture_name(Name) -->
  623    "{",
  624    (   digit(D0)
  625    ->  digits(DL),
  626        "}",
  627        { number_codes(Name, [D0|DL]) }
  628    ;   letter(A0),
  629        alnums(AL),
  630        "}",
  631        { atom_codes(Name, [A0|AL]) }
  632    ).
  633capture_name(Name) -->
  634    digit(D0),
  635    !,
  636    digits(DL),
  637    { number_codes(Name, [D0|DL]) }.
  638capture_name(Name) -->
  639    letter(A0),
  640    !,
  641    alnums(AL),
  642    { atom_codes(Name, [A0|AL]) }.
  643
  644letter(L) -->
  645    [L],
  646    { between(0'a,0'z,L)
  647    ; between(0'A,0'Z,L)
  648    ; L == 0'_
  649    }, !.
  650
  651alnums([H|T]) -->
  652    alnum(H),
  653    !,
  654    alnums(T).
  655alnums([]) -->
  656    "".
  657
  658alnum(L) -->
  659    [L],
  660    { between(0'a,0'z,L)
  661    ; between(0'A,0'Z,L)
  662    ; between(0'0,0'9,L)
  663    ; L == 0'_
  664    }, !.
  665
  666%!  re_compile(+Pattern, -Regex, +Options) is det.
  667%
  668%   Compiles Pattern  to a  Regex _blob_ of  type `regex`  (see blob/2).
  669%   Defined  Options are  given below.   Please consult  the [PCRE2  API
  670%   documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  671%   for details.  If an option is  repeated, the first value is used and
  672%   subsequent values  are ignored.   Unrecognized options  are ignored.
  673%   Unless otherwise specified, boolean options default to `false`. Some
  674%   options may not exist on your system, depending on the PCRE2 version
  675%   and  how it  was  built  - these  unsupported  options are  silently
  676%   ignored.
  677%
  678%   The various matching predicates can take  either a Regex _blob_ or a
  679%   string  pattern; if  they  are  given a  string  pattern, they  call
  680%   re_compile/3 and cache the result; so, there is little reason to use
  681%   re_compile/3 directly.
  682%
  683%     * anchored(Bool)
  684%     If `true`, match only at the first position
  685%     * auto_capture(Bool)
  686%     Enable use of numbered capturing parentheses.
  687%     (default `true`)
  688%     * bsr(Mode)
  689%     If `anycrlf`, \R only matches CR, LF or CRLF;  if `unicode`,
  690%     \R matches all Unicode line endings.
  691%     * bsr2(Mode) - synonym for bsr(Mode).
  692%     * caseless(Bool)
  693%     If `true`, do caseless matching.
  694%     * compat(With)
  695%     Error   -   PCRE1   had  =|compat(javascript)|=   for   JavaScript
  696%     compatibility, but PCRE2 has removed that.
  697%     * dollar_endonly(Bool)
  698%     If `true`, $ not to match newline at end
  699%     * dotall(Bool)
  700%     If `true`, . matches anything including NL
  701%     * dupnames(Bool)
  702%     If `true`, allow duplicate names for subpatterns
  703%     * extended(Bool)
  704%     If `true`, ignore white space and # comments
  705%     * firstline(Bool)
  706%     If `true`, force matching to be before newline
  707%     * greedy(Bool)
  708%     If  `true`,  operators such  as  `+`  and  `*` are  greedy  unless
  709%     followed by `?`; if `false`, the  operators are not greedy and `?`
  710%     has the opposite meaning. It can also be set by a `(?U)` within the
  711%     pattern  -   see  the  [PCRE2  pattern   internal  option  setting
  712%     documentation](https://www.pcre.org/current/doc/html/pcre2pattern.html#SEC13)
  713%     for details and note that the PCRE2 option is `UNGREEDY`, which is
  714%     the inverse of this package's `greedy` option.  (default `true`)
  715%     * compat(With)
  716%     Raises an error - PCRE1 had  =|compat(javascript)|= for JavaScript
  717%     compatibility, but PCRE2 has removed that option. Consider using
  718%     the `alt_bsux` and `extra_alt_bsux` options.
  719%     * multiline(Bool)
  720%     If `true`, ^ and $ match newlines within data
  721%     * newline(Mode)
  722%     If  `any`, recognize  any Unicode  newline sequence;  if `anycrlf`
  723%     (default), recognize  CR, LF,  and CRLF  as newline  sequences; if
  724%     `cr`, recognize CR;  if `lf`, recognize LF;  `crlf` recognize CRLF
  725%     as  newline; if  `nul`,  recognize the  NULL  character (0x00)  as
  726%     newline.
  727%     * newline2(Mode) - synonym for newline(Mode).
  728%     * ucp(Bool)
  729%     If `true`, use Unicode properties for \d, \w, etc.
  730%     * utf_check(Bool) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  731%     You should not need this because SWI-Prolog ensures that the UTF8 strings are valid.
  732%     * endanchored(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  733%     * allow_empty_class(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  734%     * alt_bsux(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  735%     * auto_callout(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  736%     * match_unset_backref(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  737%     * never_ucp(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  738%     * never_utf(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  739%     * auto_possess(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  740%     (default `true`)
  741%     * dotstar_anchor(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  742%     (default `true`)
  743%     * start_optimize(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  744%     (default `true`)
  745%     * utf(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  746%     * never_backslash_c(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  747%     * alt_circumflex(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  748%     * alt_verbnames(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  749%     * use_offset_limit(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  750%     * extended_more(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  751%     * literal(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  752%     * match_invalid_utf(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  753%     * jit_complete(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  754%     * jit_partial_soft(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  755%     * jit_partial_hard(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  756%     * jit_invalid_utf(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  757%     * jit(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  758%     (default `true`)
  759%     * copy_matched_subject(boolean) - see [PCRE2 API documentation](https://www.pcre.org/current/doc/html/pcre2api.html)
  760%
  761%   In addition to the options above that directly map to PCRE flags the
  762%   following options are processed:
  763%
  764%     * optimise(Bool) or optimize(Bool)
  765%     Turns on the JIT compiler for additional optimization that greatly
  766%     speeds up  the matching  performance of many  patterns. (Note
  767%     that the meaning has changed slightly from the PCRE1 implementation
  768%     - PCRE2  always optimises  where possible;  this is  an additional
  769%     optimisation.)
  770%     * capture_type(+Type)
  771%     How to return the matched part  of the input and possibly captured
  772%     groups in there.  Possible values are:
  773%       - string
  774%       Return the captured string as a string (default).
  775%       - atom
  776%       Return the captured string as an atom.
  777%       - range
  778%       Return the captured string as a pair `Start-Length`.  Note that
  779%       we use `Start-Length` rather than the more conventional
  780%       `Start-End` to allow for immediate use with sub_atom/5 and
  781%       sub_string/5.
  782%       - term
  783%       Parse the  captured string  as a Prolog  term.  This  is notably
  784%       practical if you capture a number.
  785%
  786%    The  `capture_type` specifies  the default  for this  pattern.  The
  787%    interface supports  a different type  for each _named_  group using
  788%    the syntax  `(?<name_T>...)`, where `T`  is one of  ``S`` (string),
  789%    ``A`` (atom), ``I`` (integer), ``F`` (float), ``N`` (number), ``T``
  790%    (term)  and ``R``  (range).  In the  current implementation  ``I``,
  791%    ``F`` and  ``N`` are synonyms  for ``T``.  Future versions  may act
  792%    differently if the parsed value is not of the requested numeric type.
  793%
  794%    Note that re_compile/3 does not support the Pattern/Flags form that
  795%    is supported by re_match/3, re_replace/4, etc.; the Pattern must be
  796%    text and all compile options specified in Options.
  797
  %!  re_compiled(+Spec, --Regex, +Options) is det.
  %
  %   Regex is the compiled regular expression for Spec.  Three shapes of
  %   Spec are distinguished:
  %
  %     - A Regex _blob_ is returned as is; Options is not looked at.
  %     - `Text/Flags` is handed to re_compiled_/4 with the given Flags.
  %     - Anything else is handed to re_compiled_/4 with the empty flag
  %       atom `''`.
  %
  %   Cached compiled regular expressions can be reclaimed using
  %   re_flush/0 (which also removes "compiled" With arguments from
  %   re_replace/4 and re_replace/5).

  % re_compiled_/4 is tabled, so its answers are kept until the tables
  % are abolished.
  :- table re_compiled_/4 as shared.

  re_compiled(Regex0, Regex, _Options) :-
      blob(Regex0, regex),
      !,
      Regex = Regex0.
  re_compiled(Text/Flags, Regex, Options) :-
      !,
      re_compiled_(Text, Flags, Regex, Options).
  re_compiled(Text, Regex, Options) :-
      re_compiled_(Text, '', Regex, Options).
  814
  % re_compiled_(+Text, +Flags, -Regex, +Options)
  %
  % Compile Text into Regex.  Text must be text and Flags must be an
  % atom; both are type-checked with must_be/2 before anything else
  % happens.  The options derived from the characters of Flags are
  % placed in front of the caller's Options and the combined list is
  % passed to re_compile/3.
  re_compiled_(Text, Flags, Regex, Options) =>
      must_be(text, Text),
      must_be(atom, Flags),
      re_flags_options(Flags, FlagOptions),
      append(FlagOptions, Options, AllOptions),
      re_compile(Text, Regex, AllOptions).
  821
  % re_flags_options(+Flags, -Options)
  %
  % Options is the list of options denoted by the characters of the
  % atom Flags, one option per character and in the same order.  Each
  % character is translated by re_flag_option/2, which raises an
  % existence error for a character that is not a known flag.
  re_flags_options(Flags, Options) :-
      atom_chars(Flags, FlagChars),
      re_flag_chars_options(FlagChars, Options).

  % Walk the flag characters and the option list in lock step.
  re_flag_chars_options([], []).
  re_flag_chars_options([Flag|Flags], [Option|Options]) :-
      re_flag_option(Flag, Option),
      re_flag_chars_options(Flags, Options).
  825
  % re_flag_option(+Flag, -Option)
  %
  % Option is the option that the single flag character Flag stands
  % for, committing to the first matching entry of re_flag_option_/2.
  %
  % @error existence_error(re_flag, Flag) if re_flag_option_/2 has no
  %        matching entry.
  re_flag_option(Flag, Option) :-
      (   re_flag_option_(Flag, Option)
      ->  true
      ;   existence_error(re_flag, Flag)
      ).
  831
  % re_flag_option_(?Flag, ?Option)
  %
  % Table that maps a one-character flag, as used in the `Text/Flags`
  % pattern form, onto the option it abbreviates.
  re_flag_option_(i, caseless(true)).         % matching flags
  re_flag_option_(m, multiline(true)).
  re_flag_option_(x, extended(true)).
  re_flag_option_(s, dotall(true)).
  re_flag_option_(a, capture_type(atom)).     % capture type selectors
  re_flag_option_(r, capture_type(range)).
  re_flag_option_(t, capture_type(term)).
  839
  %!  re_flush is det.
  %
  %   Clean the pattern and replacement caches by abolishing all tables
  %   that belong to the module `pcre`.
  %
  %   @tbd Flush automatically if the cache becomes too large.

  re_flush :- abolish_module_tables(pcre).
  848
  849%!  re_config(+Term)
  850%
  851%   Extract configuration information from the  pcre library. Term is of
  852%   the   form    ``Name(Value)``.    Name    is   derived    from   the
  853%   ``PCRE_CONFIG_*``  constant  after   removing  ``PCRE_CONFIG_``  and
  854%   mapping the name to  lower case, e.g.  `utf8`, `unicode_properties`,
  855%   etc.  Value is a Prolog boolean, integer, or atom. For boolean (1 or
  856%   0) values, `true` or `false` is returned.
  857%
  858%   re_config/1 will  backtrack through  all the  possible configuration
  859%   values  if its  argument  is a  variable. If  an  unknown option  is
  860%   specified, re_config/1 fails.
  861%
  862%   Non-compatible  changes  between  PCRE1 and  PCRE2  because  numeric
  863%   values changed: `bsr` and `newline` have been replaced by `bsr2` and
  864%   `newline2`:
  865%     * `bsr2` - previously `bsr` returned 0 or 1; now returns `unicode`
  866%       or `anycrlf`
  867%     * `newline2`  -  previously  `newline` returned  an  integer,  now
  868%       returns `cr`, `lf`, `crlf`, `any`, `anycrlf`, `nul`
  869%
  870%   Term values are as follows. Some values might not exist, depending on
  871%   the version of PCRE2 and the options it was built with.
  872%
  873%   * bsr2
  874%     The character  sequences that the `\R` escape sequence  matches by
  875%     default. Replaces `bsr` option from PCRE1, which is not compatible.
  876%   * compiled_widths
  877%     An integer whose  lower bits indicate which code  unit widths were
  878%     selected when PCRE2 was built.  The 1-bit indicates 8-bit support,
  879%     and  the  2-bit and  4-bit  indicate  16-bit and  32-bit  support,
  880%     respectively. The 1  bit should always be set  because the wrapper
  881%     code requires 8 bit support.
  882%   * depthlimit
  883%   * heaplimit
  884%   * jit
  885%     `true` if just-in-time compiling is available.
  886%   * jittarget
  887%     A string containing the name of the architecture for which the JIT
  888%     compiler is configured. e.g., 'x86 64bit (little endian + unaligned)'.
  889%   * linksize
  890%   * matchlimit
  891%   * never_backslash_c
  892%   * newline2
  893%     An atom whose value specifies  the default character sequence that
  894%     is  recognized as  meaning "newline"  (`cr`, `lf`,  `crlf`, `any`,
  895%     `anycrlf`, `nul`).  Replaces `newline` option from PCRE1, which is
  896%     not compatible.
  897%   * parenslimit
  898%   * stackrecurse
  899%   * unicode
  900%     Always `true`
  901%   * unicode_version
  902%     The unicode version as an atom, e.g. '12.1.0'.
  903%   * utf8 - synonym for `unicode`
  904%   * parens_limit
  905%   * version
  906%     The version information as an atom, containing the PCRE version
  907%     number and release date, e.g. '10.34 2019-11-21'.
  908%
  909%   For backwards compatibility with  PCRE1, the following are accepted,
  910%   but are deprecated:
  911%     * `utf8` - synonym for `unicode`
  912%     * `link_size` - synonym for `linksize`
  913%     * `match_limit` - synonym for `matchlimit`
  914%     * `parens_limit` - synonym for `parenslimit`
  915%     * `unicode_properties` - always true
  916%   The following  have been removed  because they don't exist  in PCRE2
  917%   and don't seem to have any meaningful use in PCRE1:
  918%     * `posix_malloc_threshold`
  919%     * `match_limit_recursion`
  920%
  921%   @see `man pcre2api` for details
  922
  923re_config(Term), var(Term) =>
  924    re_config_choice(Term),
  925    % This code depends on re_config_/1 failing if it's given an invalid
  926    % Term (e.g., re_config_(jittarget(_)) fails if jit(false)). If
  927    % re_config_/1 is changed to throw an error, then the following call
  928    % needs to be inside catch/3.
  929    re_config_(Term).
  930re_config(Term) =>
  931    re_config_(Term)