mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-02-07 06:53:33 +00:00
Add PCRE2 library to third_party
This commit is contained in:
parent
4f66d7f2dd
commit
b27e9fe845
47 changed files with 58274 additions and 0 deletions
1
Makefile
1
Makefile
|
@ -225,6 +225,7 @@ include third_party/stb/BUILD.mk # │
|
|||
include third_party/mbedtls/BUILD.mk # │
|
||||
include third_party/libcxx/BUILD.mk # │
|
||||
include third_party/ggml/BUILD.mk # │
|
||||
include third_party/pcre/BUILD.mk # │
|
||||
include third_party/radpajama/BUILD.mk # │
|
||||
include net/https/BUILD.mk # │
|
||||
include third_party/regex/BUILD.mk #─┘
|
||||
|
|
1
third_party/BUILD.mk
vendored
1
third_party/BUILD.mk
vendored
|
@ -24,6 +24,7 @@ o/$(MODE)/third_party: \
|
|||
o/$(MODE)/third_party/mbedtls \
|
||||
o/$(MODE)/third_party/musl \
|
||||
o/$(MODE)/third_party/nsync \
|
||||
o/$(MODE)/third_party/pcre \
|
||||
o/$(MODE)/third_party/puff \
|
||||
o/$(MODE)/third_party/python \
|
||||
o/$(MODE)/third_party/quickjs \
|
||||
|
|
69
third_party/pcre/BUILD.mk
vendored
Normal file
69
third_party/pcre/BUILD.mk
vendored
Normal file
|
@ -0,0 +1,69 @@
|
|||
#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
|
||||
#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘
|
||||
|
||||
PKGS += THIRD_PARTY_PCRE
|
||||
|
||||
THIRD_PARTY_PCRE_ARTIFACTS += THIRD_PARTY_PCRE_A
|
||||
THIRD_PARTY_PCRE = $(THIRD_PARTY_PCRE_A_DEPS) $(THIRD_PARTY_PCRE_A)
|
||||
THIRD_PARTY_PCRE_A = o/$(MODE)/third_party/pcre/pcre.a
|
||||
THIRD_PARTY_PCRE_A_FILES := $(wildcard third_party/pcre/*)
|
||||
THIRD_PARTY_PCRE_A_HDRS = $(filter %.h,$(THIRD_PARTY_PCRE_A_FILES))
|
||||
THIRD_PARTY_PCRE_A_INCS = $(filter %.inc,$(THIRD_PARTY_PCRE_A_FILES))
|
||||
THIRD_PARTY_PCRE_A_SRCS = $(filter %.c,$(THIRD_PARTY_PCRE_A_FILES))
|
||||
THIRD_PARTY_PCRE_A_OBJS = $(THIRD_PARTY_PCRE_A_SRCS:%.c=o/$(MODE)/%.o)
|
||||
|
||||
THIRD_PARTY_PCRE_A_CHECKS = \
|
||||
$(THIRD_PARTY_PCRE_A).pkg \
|
||||
o/$(MODE)/third_party/pcre/pcre2posix_test.com.runs
|
||||
|
||||
THIRD_PARTY_PCRE_A_DIRECTDEPS = \
|
||||
LIBC_CALLS \
|
||||
LIBC_FMT \
|
||||
LIBC_INTRIN \
|
||||
LIBC_MEM \
|
||||
LIBC_NEXGEN32E \
|
||||
LIBC_PROC \
|
||||
LIBC_RUNTIME \
|
||||
LIBC_STDIO \
|
||||
LIBC_STR \
|
||||
LIBC_SYSV
|
||||
|
||||
THIRD_PARTY_PCRE_A_DEPS := \
|
||||
$(call uniq,$(foreach x,$(THIRD_PARTY_PCRE_A_DIRECTDEPS),$($(x))))
|
||||
|
||||
$(THIRD_PARTY_PCRE_A): \
|
||||
third_party/pcre/ \
|
||||
$(THIRD_PARTY_PCRE_A).pkg \
|
||||
$(THIRD_PARTY_PCRE_A_OBJS)
|
||||
|
||||
$(THIRD_PARTY_PCRE_A).pkg: \
|
||||
$(THIRD_PARTY_PCRE_A_OBJS) \
|
||||
$(foreach x,$(THIRD_PARTY_PCRE_A_DIRECTDEPS),$($(x)_A).pkg)
|
||||
|
||||
$(THIRD_PARTY_PCRE_A_OBJS): private CPPFLAGS += -DHAVE_CONFIG_H
|
||||
|
||||
o/$(MODE)/third_party/pcre/%.com.dbg: \
|
||||
$(THIRD_PARTY_PCRE) \
|
||||
o/$(MODE)/third_party/pcre/%.o \
|
||||
$(CRT) \
|
||||
$(APE_NO_MODIFY_SELF)
|
||||
@$(APELINK)
|
||||
|
||||
THIRD_PARTY_PCRE_COMS = \
|
||||
o/$(MODE)/third_party/pcre/pcre2grep.com \
|
||||
o/$(MODE)/third_party/pcre/pcre2posix_test.com
|
||||
|
||||
THIRD_PARTY_PCRE_BINS = $(THIRD_PARTY_PCRE_COMS) $(THIRD_PARTY_PCRE_COMS:%=%.dbg)
|
||||
THIRD_PARTY_PCRE_LIBS = $(foreach x,$(THIRD_PARTY_PCRE_ARTIFACTS),$($(x)))
|
||||
THIRD_PARTY_PCRE_SRCS = $(foreach x,$(THIRD_PARTY_PCRE_ARTIFACTS),$($(x)_SRCS))
|
||||
THIRD_PARTY_PCRE_HDRS = $(foreach x,$(THIRD_PARTY_PCRE_ARTIFACTS),$($(x)_HDRS))
|
||||
THIRD_PARTY_PCRE_INCS = $(foreach x,$(THIRD_PARTY_PCRE_ARTIFACTS),$($(x)_INCS))
|
||||
THIRD_PARTY_PCRE_CHECKS = $(foreach x,$(THIRD_PARTY_PCRE_ARTIFACTS),$($(x)_CHECKS))
|
||||
THIRD_PARTY_PCRE_OBJS = $(foreach x,$(THIRD_PARTY_PCRE_ARTIFACTS),$($(x)_OBJS))
|
||||
$(THIRD_PARTY_PCRE_OBJS): $(BUILD_FILES) third_party/pcre/BUILD.mk
|
||||
|
||||
.PHONY: o/$(MODE)/third_party/pcre
|
||||
o/$(MODE)/third_party/pcre: \
|
||||
$(THIRD_PARTY_PCRE_A) \
|
||||
$(THIRD_PARTY_PCRE_BINS) \
|
||||
$(THIRD_PARTY_PCRE_CHECKS)
|
94
third_party/pcre/LICENCE
vendored
Normal file
94
third_party/pcre/LICENCE
vendored
Normal file
|
@ -0,0 +1,94 @@
|
|||
PCRE2 LICENCE
|
||||
-------------
|
||||
|
||||
PCRE2 is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Releases 10.00 and above of PCRE2 are distributed under the terms of the "BSD"
|
||||
licence, as specified below, with one exemption for certain binary
|
||||
redistributions. The documentation for PCRE2, supplied in the "doc" directory,
|
||||
is distributed under the same terms as the software itself. The data in the
|
||||
testdata directory is not copyrighted and is in the public domain.
|
||||
|
||||
The basic library functions are written in C and are freestanding. Also
|
||||
included in the distribution is a just-in-time compiler that can be used to
|
||||
optimize pattern matching. This is an optional feature that can be omitted when
|
||||
the library is built.
|
||||
|
||||
|
||||
THE BASIC LIBRARY FUNCTIONS
|
||||
---------------------------
|
||||
|
||||
Written by: Philip Hazel
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
|
||||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
|
||||
PCRE2 JUST-IN-TIME COMPILATION SUPPORT
|
||||
--------------------------------------
|
||||
|
||||
Written by: Zoltan Herczeg
|
||||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
STACK-LESS JUST-IN-TIME COMPILER
|
||||
--------------------------------
|
||||
|
||||
Written by: Zoltan Herczeg
|
||||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
THE "BSD" LICENCE
|
||||
-----------------
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notices,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notices, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of any
|
||||
contributors may be used to endorse or promote products derived from this
|
||||
software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
EXEMPTION FOR BINARY LIBRARY-LIKE PACKAGES
|
||||
------------------------------------------
|
||||
|
||||
The second condition in the BSD licence (covering binary redistributions) does
|
||||
not apply all the way down a chain of software. If binary package A includes
|
||||
PCRE2, it must respect the condition, but if package B is software that
|
||||
includes package A, the condition is not imposed on package B unless it uses
|
||||
PCRE2 independently.
|
||||
|
||||
End
|
15
third_party/pcre/README.cosmo
vendored
Normal file
15
third_party/pcre/README.cosmo
vendored
Normal file
|
@ -0,0 +1,15 @@
|
|||
DESCRIPTION
|
||||
|
||||
PCRE2 is a regular expression library
|
||||
|
||||
LICENSE
|
||||
|
||||
PCRE2 License
|
||||
|
||||
ORIGIN
|
||||
|
||||
https://github.com/PCRE2Project/pcre2/archive/refs/tags/pcre2-10.42.tar.gz
|
||||
|
||||
LOCAL CHANGES
|
||||
|
||||
Removed JIT code.
|
446
third_party/pcre/config.h
vendored
Normal file
446
third_party/pcre/config.h
vendored
Normal file
|
@ -0,0 +1,446 @@
|
|||
/* src/config.h. Generated from config.h.in by configure. */
|
||||
/* src/config.h.in. Generated from configure.ac by autoheader. */
|
||||
|
||||
|
||||
/* PCRE2 is written in Standard C, but there are a few non-standard things it
|
||||
can cope with, allowing it to run on SunOS4 and other "close to standard"
|
||||
systems.
|
||||
|
||||
In environments that support the GNU autotools, config.h.in is converted into
|
||||
config.h by the "configure" script. In environments that use CMake,
|
||||
config-cmake.in is converted into config.h. If you are going to build PCRE2 "by
|
||||
hand" without using "configure" or CMake, you should copy the distributed
|
||||
config.h.generic to config.h, and edit the macro definitions to be the way you
|
||||
need them. You must then add -DHAVE_CONFIG_H to all of your compile commands,
|
||||
so that config.h is included at the start of every source.
|
||||
|
||||
Alternatively, you can avoid editing by using -D on the compiler command line
|
||||
to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H,
|
||||
but if you do, default values will be taken from config.h for non-boolean
|
||||
macros that are not defined on the command line.
|
||||
|
||||
Boolean macros such as HAVE_STDLIB_H and SUPPORT_PCRE2_8 should either be
|
||||
defined (conventionally to 1) for TRUE, and not defined at all for FALSE. All
|
||||
such macros are listed as a commented #undef in config.h.generic. Macros such
|
||||
as MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are
|
||||
surrounded by #ifndef/#endif lines so that the value can be overridden by -D.
|
||||
|
||||
PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if
|
||||
HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make
|
||||
sure both macros are undefined; an emulation function will then be used. */
|
||||
|
||||
/* By default, the \R escape sequence matches any Unicode line ending
|
||||
character or sequence of characters. If BSR_ANYCRLF is defined (to any
|
||||
value), this is changed so that backslash-R matches only CR, LF, or CRLF.
|
||||
The build-time default can be overridden by the user of PCRE2 at runtime.
|
||||
*/
|
||||
/* #undef BSR_ANYCRLF */
|
||||
|
||||
/* Define to any value to disable the use of the z and t modifiers in
|
||||
formatting settings such as %zu or %td (this is rarely needed). */
|
||||
/* #undef DISABLE_PERCENT_ZT */
|
||||
|
||||
/* If you are compiling for a system that uses EBCDIC instead of ASCII
|
||||
character codes, define this macro to any value. When EBCDIC is set, PCRE2
|
||||
assumes that all input strings are in EBCDIC. If you do not define this
|
||||
macro, PCRE2 will assume input strings are ASCII or UTF-8/16/32 Unicode. It
|
||||
is not possible to build a version of PCRE2 that supports both EBCDIC and
|
||||
UTF-8/16/32. */
|
||||
/* #undef EBCDIC */
|
||||
|
||||
/* In an EBCDIC environment, define this macro to any value to arrange for the
|
||||
NL character to be 0x25 instead of the default 0x15. NL plays the role that
|
||||
LF does in an ASCII/Unicode environment. */
|
||||
/* #undef EBCDIC_NL25 */
|
||||
|
||||
/* Define this if your compiler supports __attribute__((uninitialized)) */
|
||||
#define HAVE_ATTRIBUTE_UNINITIALIZED 1
|
||||
|
||||
/* Define to 1 if you have the `bcopy' function. */
|
||||
#define HAVE_BCOPY 1
|
||||
|
||||
/* Define to 1 if you have the <bzlib.h> header file. */
|
||||
/* #undef HAVE_BZLIB_H */
|
||||
|
||||
/* Define to 1 if you have the <dirent.h> header file. */
|
||||
#define HAVE_DIRENT_H 1
|
||||
|
||||
/* Define to 1 if you have the <dlfcn.h> header file. */
|
||||
#define HAVE_DLFCN_H 1
|
||||
|
||||
/* Define to 1 if you have the <editline/readline.h> header file. */
|
||||
/* #undef HAVE_EDITLINE_READLINE_H */
|
||||
|
||||
/* Define to 1 if you have the <edit/readline/readline.h> header file. */
|
||||
/* #undef HAVE_EDIT_READLINE_READLINE_H */
|
||||
|
||||
/* Define to 1 if you have the <inttypes.h> header file. */
|
||||
#define HAVE_INTTYPES_H 1
|
||||
|
||||
/* Define to 1 if you have the <limits.h> header file. */
|
||||
#define HAVE_LIMITS_H 1
|
||||
|
||||
/* Define to 1 if you have the `memfd_create' function. */
|
||||
#define HAVE_MEMFD_CREATE 1
|
||||
|
||||
/* Define to 1 if you have the `memmove' function. */
|
||||
#define HAVE_MEMMOVE 1
|
||||
|
||||
/* Define to 1 if you have the <minix/config.h> header file. */
|
||||
/* #undef HAVE_MINIX_CONFIG_H */
|
||||
|
||||
/* Define to 1 if you have the `mkostemp' function. */
|
||||
#define HAVE_MKOSTEMP 1
|
||||
|
||||
/* Define if you have POSIX threads libraries and header files. */
|
||||
/* #undef HAVE_PTHREAD */
|
||||
|
||||
/* Have PTHREAD_PRIO_INHERIT. */
|
||||
/* #undef HAVE_PTHREAD_PRIO_INHERIT */
|
||||
|
||||
/* Define to 1 if you have the <readline.h> header file. */
|
||||
/* #undef HAVE_READLINE_H */
|
||||
|
||||
/* Define to 1 if you have the <readline/history.h> header file. */
|
||||
/* #undef HAVE_READLINE_HISTORY_H */
|
||||
|
||||
/* Define to 1 if you have the <readline/readline.h> header file. */
|
||||
/* #undef HAVE_READLINE_READLINE_H */
|
||||
|
||||
/* Define to 1 if you have the `realpath' function. */
|
||||
#define HAVE_REALPATH 1
|
||||
|
||||
/* Define to 1 if you have the `secure_getenv' function. */
|
||||
#define HAVE_SECURE_GETENV 1
|
||||
|
||||
/* Define to 1 if you have the <stdint.h> header file. */
|
||||
#define HAVE_STDINT_H 1
|
||||
|
||||
/* Define to 1 if you have the <stdio.h> header file. */
|
||||
#define HAVE_STDIO_H 1
|
||||
|
||||
/* Define to 1 if you have the <stdlib.h> header file. */
|
||||
#define HAVE_STDLIB_H 1
|
||||
|
||||
/* Define to 1 if you have the `strerror' function. */
|
||||
#define HAVE_STRERROR 1
|
||||
|
||||
/* Define to 1 if you have the <strings.h> header file. */
|
||||
#define HAVE_STRINGS_H 1
|
||||
|
||||
/* Define to 1 if you have the <string.h> header file. */
|
||||
#define HAVE_STRING_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/stat.h> header file. */
|
||||
#define HAVE_SYS_STAT_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/types.h> header file. */
|
||||
#define HAVE_SYS_TYPES_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/wait.h> header file. */
|
||||
#define HAVE_SYS_WAIT_H 1
|
||||
|
||||
/* Define to 1 if you have the <unistd.h> header file. */
|
||||
#define HAVE_UNISTD_H 1
|
||||
|
||||
/* Define to 1 if the compiler supports simple visibility declarations. */
|
||||
#define HAVE_VISIBILITY 1
|
||||
|
||||
/* Define to 1 if you have the <wchar.h> header file. */
|
||||
#define HAVE_WCHAR_H 1
|
||||
|
||||
/* Define to 1 if you have the <windows.h> header file. */
|
||||
/* #undef HAVE_WINDOWS_H */
|
||||
|
||||
/* Define to 1 if you have the <zlib.h> header file. */
|
||||
/* #undef HAVE_ZLIB_H */
|
||||
|
||||
/* This limits the amount of memory that may be used while matching a pattern.
|
||||
It applies to both pcre2_match() and pcre2_dfa_match(). It does not apply
|
||||
to JIT matching. The value is in kibibytes (units of 1024 bytes). */
|
||||
#define HEAP_LIMIT 20000000
|
||||
|
||||
/* The value of LINK_SIZE determines the number of bytes used to store links
|
||||
as offsets within the compiled regex. The default is 2, which allows for
|
||||
compiled patterns up to 65535 code units long. This covers the vast
|
||||
majority of cases. However, PCRE2 can also be compiled to use 3 or 4 bytes
|
||||
instead. This allows for longer patterns in extreme cases. */
|
||||
#define LINK_SIZE 2
|
||||
|
||||
/* Define to the sub-directory where libtool stores uninstalled libraries. */
|
||||
#define LT_OBJDIR ".libs/"
|
||||
|
||||
/* The value of MATCH_LIMIT determines the default number of times the
|
||||
pcre2_match() function can record a backtrack position during a single
|
||||
matching attempt. The value is also used to limit a loop counter in
|
||||
pcre2_dfa_match(). There is a runtime interface for setting a different
|
||||
limit. The limit exists in order to catch runaway regular expressions that
|
||||
take for ever to determine that they do not match. The default is set very
|
||||
large so that it does not accidentally catch legitimate cases. */
|
||||
#define MATCH_LIMIT 10000000
|
||||
|
||||
/* The above limit applies to all backtracks, whether or not they are nested.
|
||||
In some environments it is desirable to limit the nesting of backtracking
|
||||
(that is, the depth of tree that is searched) more strictly, in order to
|
||||
restrict the maximum amount of heap memory that is used. The value of
|
||||
MATCH_LIMIT_DEPTH provides this facility. To have any useful effect, it
|
||||
must be less than the value of MATCH_LIMIT. The default is to use the same
|
||||
value as MATCH_LIMIT. There is a runtime method for setting a different
|
||||
limit. In the case of pcre2_dfa_match(), this limit controls the depth of
|
||||
the internal nested function calls that are used for pattern recursions,
|
||||
lookarounds, and atomic groups. */
|
||||
#define MATCH_LIMIT_DEPTH MATCH_LIMIT
|
||||
|
||||
/* This limit is parameterized just in case anybody ever wants to change it.
|
||||
Care must be taken if it is increased, because it guards against integer
|
||||
overflow caused by enormously large patterns. */
|
||||
#define MAX_NAME_COUNT 10000
|
||||
|
||||
/* This limit is parameterized just in case anybody ever wants to change it.
|
||||
Care must be taken if it is increased, because it guards against integer
|
||||
overflow caused by enormously large patterns. */
|
||||
#define MAX_NAME_SIZE 32
|
||||
|
||||
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
|
||||
/* #undef NEVER_BACKSLASH_C */
|
||||
|
||||
/* The value of NEWLINE_DEFAULT determines the default newline character
|
||||
sequence. PCRE2 client programs can override this by selecting other values
|
||||
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), 5
|
||||
(ANYCRLF), and 6 (NUL). */
|
||||
#define NEWLINE_DEFAULT 2
|
||||
|
||||
/* Name of package */
|
||||
#define PACKAGE "pcre2"
|
||||
|
||||
/* Define to the address where bug reports for this package should be sent. */
|
||||
#define PACKAGE_BUGREPORT ""
|
||||
|
||||
/* Define to the full name of this package. */
|
||||
#define PACKAGE_NAME "PCRE2"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "PCRE2 10.42"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "pcre2"
|
||||
|
||||
/* Define to the home page for this package. */
|
||||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "10.42"
|
||||
|
||||
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
|
||||
parentheses (of any kind) in a pattern. This limits the amount of system
|
||||
stack that is used while compiling a pattern. */
|
||||
#define PARENS_NEST_LIMIT 250
|
||||
|
||||
/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
|
||||
pcre2grep to hold parts of the file it is searching. The buffer will be
|
||||
expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing
|
||||
very long lines. The actual amount of memory used by pcre2grep is three
|
||||
times this number, because it allows for the buffering of "before" and
|
||||
"after" lines. */
|
||||
#define PCRE2GREP_BUFSIZE 20480
|
||||
|
||||
/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
|
||||
used by pcre2grep to hold parts of the file it is searching. The actual
|
||||
amount of memory used by pcre2grep is three times this number, because it
|
||||
allows for the buffering of "before" and "after" lines. */
|
||||
#define PCRE2GREP_MAX_BUFSIZE 1048576
|
||||
|
||||
/* to make a symbol visible */
|
||||
#define PCRE2POSIX_EXP_DECL extern __attribute__ ((visibility ("default")))
|
||||
|
||||
/* to make a symbol visible */
|
||||
#define PCRE2POSIX_EXP_DEFN extern __attribute__ ((visibility ("default")))
|
||||
|
||||
/* Define to any value to include debugging code. */
|
||||
/* #undef PCRE2_DEBUG */
|
||||
|
||||
/* to make a symbol visible */
|
||||
#define PCRE2_EXP_DECL extern __attribute__ ((visibility ("default")))
|
||||
|
||||
|
||||
/* If you are compiling for a system other than a Unix-like system or
|
||||
Win32, and it needs some magic to be inserted before the definition
|
||||
of a function that is exported by the library, define this macro to
|
||||
contain the relevant magic. If you do not define this macro, a suitable
|
||||
__declspec value is used for Windows systems; in other environments
|
||||
"extern" is used for a C compiler and "extern C" for a C++ compiler.
|
||||
This macro apears at the start of every exported function that is part
|
||||
of the external API. It does not appear on functions that are "external"
|
||||
in the C sense, but which are internal to the library. */
|
||||
#define PCRE2_EXP_DEFN __attribute__ ((visibility ("default")))
|
||||
|
||||
/* Define to any value if linking statically (TODO: make nice with Libtool) */
|
||||
#define PCRE2_STATIC 1
|
||||
|
||||
/* Define to necessary symbol if this constant uses a non-standard name on
|
||||
your system. */
|
||||
/* #undef PTHREAD_CREATE_JOINABLE */
|
||||
|
||||
/* Define to any non-zero number to enable support for SELinux compatible
|
||||
executable memory allocator in JIT. Note that this will have no effect
|
||||
unless SUPPORT_JIT is also defined. */
|
||||
/* #undef SLJIT_PROT_EXECUTABLE_ALLOCATOR */
|
||||
|
||||
/* Define to 1 if all of the C90 standard headers exist (not just the ones
|
||||
required in a freestanding environment). This macro is provided for
|
||||
backward compatibility; new code need not use it. */
|
||||
#define STDC_HEADERS 1
|
||||
|
||||
/* Define to any value to enable support for Just-In-Time compiling. */
|
||||
/* #undef SUPPORT_JIT */
|
||||
|
||||
/* Define to any value to allow pcre2grep to be linked with libbz2, so that it
|
||||
is able to handle .bz2 files. */
|
||||
/* #undef SUPPORT_LIBBZ2 */
|
||||
|
||||
/* Define to any value to allow pcre2test to be linked with libedit. */
|
||||
/* #undef SUPPORT_LIBEDIT */
|
||||
|
||||
/* Define to any value to allow pcre2test to be linked with libreadline. */
|
||||
/* #undef SUPPORT_LIBREADLINE */
|
||||
|
||||
/* Define to any value to allow pcre2grep to be linked with libz, so that it
|
||||
is able to handle .gz files. */
|
||||
/* #undef SUPPORT_LIBZ */
|
||||
|
||||
/* Define to any value to enable callout script support in pcre2grep. */
|
||||
#define SUPPORT_PCRE2GREP_CALLOUT /**/
|
||||
|
||||
/* Define to any value to enable fork support in pcre2grep callout scripts.
|
||||
This will have no effect unless SUPPORT_PCRE2GREP_CALLOUT is also defined.
|
||||
*/
|
||||
#define SUPPORT_PCRE2GREP_CALLOUT_FORK /**/
|
||||
|
||||
/* Define to any value to enable JIT support in pcre2grep. Note that this will
|
||||
have no effect unless SUPPORT_JIT is also defined. */
|
||||
/* #undef SUPPORT_PCRE2GREP_JIT */
|
||||
|
||||
/* Define to any value to enable the 16 bit PCRE2 library. */
|
||||
/* #undef SUPPORT_PCRE2_16 */
|
||||
|
||||
/* Define to any value to enable the 32 bit PCRE2 library. */
|
||||
/* #undef SUPPORT_PCRE2_32 */
|
||||
|
||||
/* Define to any value to enable the 8 bit PCRE2 library. */
|
||||
#define SUPPORT_PCRE2_8 /**/
|
||||
|
||||
/* Define to any value to enable support for Unicode and UTF encoding. This
|
||||
will work even in an EBCDIC environment, but it is incompatible with the
|
||||
EBCDIC macro. That is, PCRE2 can support *either* EBCDIC code *or*
|
||||
ASCII/Unicode, but not both at once. */
|
||||
#define SUPPORT_UNICODE /**/
|
||||
|
||||
/* Define to any value for valgrind support to find invalid memory reads. */
|
||||
/* #undef SUPPORT_VALGRIND */
|
||||
|
||||
/* Enable extensions on AIX 3, Interix. */
|
||||
#ifndef _ALL_SOURCE
|
||||
# define _ALL_SOURCE 1
|
||||
#endif
|
||||
/* Enable general extensions on macOS. */
|
||||
#ifndef _DARWIN_C_SOURCE
|
||||
# define _DARWIN_C_SOURCE 1
|
||||
#endif
|
||||
/* Enable general extensions on Solaris. */
|
||||
#ifndef __EXTENSIONS__
|
||||
# define __EXTENSIONS__ 1
|
||||
#endif
|
||||
/* Enable GNU extensions on systems that have them. */
|
||||
#ifndef _GNU_SOURCE
|
||||
# define _GNU_SOURCE 1
|
||||
#endif
|
||||
/* Enable X/Open compliant socket functions that do not require linking
|
||||
with -lxnet on HP-UX 11.11. */
|
||||
#ifndef _HPUX_ALT_XOPEN_SOCKET_API
|
||||
# define _HPUX_ALT_XOPEN_SOCKET_API 1
|
||||
#endif
|
||||
/* Identify the host operating system as Minix.
|
||||
This macro does not affect the system headers' behavior.
|
||||
A future release of Autoconf may stop defining this macro. */
|
||||
#ifndef _MINIX
|
||||
/* # undef _MINIX */
|
||||
#endif
|
||||
/* Enable general extensions on NetBSD.
|
||||
Enable NetBSD compatibility extensions on Minix. */
|
||||
#ifndef _NETBSD_SOURCE
|
||||
# define _NETBSD_SOURCE 1
|
||||
#endif
|
||||
/* Enable OpenBSD compatibility extensions on NetBSD.
|
||||
Oddly enough, this does nothing on OpenBSD. */
|
||||
#ifndef _OPENBSD_SOURCE
|
||||
# define _OPENBSD_SOURCE 1
|
||||
#endif
|
||||
/* Define to 1 if needed for POSIX-compatible behavior. */
|
||||
#ifndef _POSIX_SOURCE
|
||||
/* # undef _POSIX_SOURCE */
|
||||
#endif
|
||||
/* Define to 2 if needed for POSIX-compatible behavior. */
|
||||
#ifndef _POSIX_1_SOURCE
|
||||
/* # undef _POSIX_1_SOURCE */
|
||||
#endif
|
||||
/* Enable POSIX-compatible threading on Solaris. */
|
||||
#ifndef _POSIX_PTHREAD_SEMANTICS
|
||||
# define _POSIX_PTHREAD_SEMANTICS 1
|
||||
#endif
|
||||
/* Enable extensions specified by ISO/IEC TS 18661-5:2014. */
|
||||
#ifndef __STDC_WANT_IEC_60559_ATTRIBS_EXT__
|
||||
# define __STDC_WANT_IEC_60559_ATTRIBS_EXT__ 1
|
||||
#endif
|
||||
/* Enable extensions specified by ISO/IEC TS 18661-1:2014. */
|
||||
#ifndef __STDC_WANT_IEC_60559_BFP_EXT__
|
||||
# define __STDC_WANT_IEC_60559_BFP_EXT__ 1
|
||||
#endif
|
||||
/* Enable extensions specified by ISO/IEC TS 18661-2:2015. */
|
||||
#ifndef __STDC_WANT_IEC_60559_DFP_EXT__
|
||||
# define __STDC_WANT_IEC_60559_DFP_EXT__ 1
|
||||
#endif
|
||||
/* Enable extensions specified by ISO/IEC TS 18661-4:2015. */
|
||||
#ifndef __STDC_WANT_IEC_60559_FUNCS_EXT__
|
||||
# define __STDC_WANT_IEC_60559_FUNCS_EXT__ 1
|
||||
#endif
|
||||
/* Enable extensions specified by ISO/IEC TS 18661-3:2015. */
|
||||
#ifndef __STDC_WANT_IEC_60559_TYPES_EXT__
|
||||
# define __STDC_WANT_IEC_60559_TYPES_EXT__ 1
|
||||
#endif
|
||||
/* Enable extensions specified by ISO/IEC TR 24731-2:2010. */
|
||||
#ifndef __STDC_WANT_LIB_EXT2__
|
||||
# define __STDC_WANT_LIB_EXT2__ 1
|
||||
#endif
|
||||
/* Enable extensions specified by ISO/IEC 24747:2009. */
|
||||
#ifndef __STDC_WANT_MATH_SPEC_FUNCS__
|
||||
# define __STDC_WANT_MATH_SPEC_FUNCS__ 1
|
||||
#endif
|
||||
/* Enable extensions on HP NonStop. */
|
||||
#ifndef _TANDEM_SOURCE
|
||||
# define _TANDEM_SOURCE 1
|
||||
#endif
|
||||
/* Enable X/Open extensions. Define to 500 only if necessary
|
||||
to make mbstate_t available. */
|
||||
#ifndef _XOPEN_SOURCE
|
||||
/* # undef _XOPEN_SOURCE */
|
||||
#endif
|
||||
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "10.42"
|
||||
|
||||
/* Number of bits in a file offset, on hosts where this is settable. */
|
||||
/* #undef _FILE_OFFSET_BITS */
|
||||
|
||||
/* Define for large files, on AIX-style hosts. */
|
||||
/* #undef _LARGE_FILES */
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
/* #undef const */
|
||||
|
||||
/* Define to the type of a signed integer type of width exactly 64 bits if
|
||||
such a type exists and the standard includes do not define it. */
|
||||
/* #undef int64_t */
|
||||
|
||||
/* Define to `unsigned int' if <sys/types.h> does not define. */
|
||||
/* #undef size_t */
|
997
third_party/pcre/pcre2.h
vendored
Normal file
997
third_party/pcre/pcre2.h
vendored
Normal file
|
@ -0,0 +1,997 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* This is the public header file for the PCRE library, second API, to be
|
||||
#included by applications that call PCRE2 functions.
|
||||
|
||||
Copyright (c) 2016-2021 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef PCRE2_H_IDEMPOTENT_GUARD
|
||||
#define PCRE2_H_IDEMPOTENT_GUARD
|
||||
|
||||
#ifndef PCRE2_CODE_UNIT_WIDTH
|
||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||
#endif
|
||||
|
||||
/* The current PCRE version information. */
|
||||
|
||||
#define PCRE2_MAJOR 10
|
||||
#define PCRE2_MINOR 42
|
||||
#define PCRE2_PRERELEASE
|
||||
#define PCRE2_DATE 2022-12-11
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE2, the appropriate
|
||||
export setting is defined in pcre2_internal.h, which includes this file. So we
|
||||
don't change existing definitions of PCRE2_EXP_DECL. */
|
||||
|
||||
#if defined(_WIN32) && !defined(PCRE2_STATIC)
|
||||
# ifndef PCRE2_EXP_DECL
|
||||
# define PCRE2_EXP_DECL extern __declspec(dllimport)
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* By default, we use the standard "extern" declarations. */
|
||||
|
||||
#ifndef PCRE2_EXP_DECL
|
||||
# ifdef __cplusplus
|
||||
# define PCRE2_EXP_DECL extern "C"
|
||||
# else
|
||||
# define PCRE2_EXP_DECL extern
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* When compiling with the MSVC compiler, it is sometimes necessary to include
|
||||
a "calling convention" before exported function names. (This is secondhand
|
||||
information; I know nothing about MSVC myself). For example, something like
|
||||
|
||||
void __cdecl function(....)
|
||||
|
||||
might be needed. In order so make this easy, all the exported functions have
|
||||
PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not
|
||||
set, we ensure here that it has no effect. */
|
||||
|
||||
#ifndef PCRE2_CALL_CONVENTION
|
||||
#define PCRE2_CALL_CONVENTION
|
||||
#endif
|
||||
|
||||
/* Have to include limits.h, stdlib.h, and inttypes.h to ensure that size_t and
|
||||
uint8_t, UCHAR_MAX, etc are defined. Some systems that do have inttypes.h do
|
||||
not have stdint.h, which is why we use inttypes.h, which according to the C
|
||||
standard is a superset of stdint.h. If inttypes.h is not available the build
|
||||
will break and the relevant values must be provided by some other means. */
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
/* Allow for C++ users compiling this directly. */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* The following option bits can be passed to pcre2_compile(), pcre2_match(),
|
||||
or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the function to which it
|
||||
is passed. Put these bits at the most significant end of the options word so
|
||||
others can be added next to them */
|
||||
|
||||
#define PCRE2_ANCHORED 0x80000000u
|
||||
#define PCRE2_NO_UTF_CHECK 0x40000000u
|
||||
#define PCRE2_ENDANCHORED 0x20000000u
|
||||
|
||||
/* The following option bits can be passed only to pcre2_compile(). However,
|
||||
they may affect compilation, JIT compilation, and/or interpretive execution.
|
||||
The following tags indicate which:
|
||||
|
||||
C alters what is compiled by pcre2_compile()
|
||||
J alters what is compiled by pcre2_jit_compile()
|
||||
M is inspected during pcre2_match() execution
|
||||
D is inspected during pcre2_dfa_match() execution
|
||||
*/
|
||||
|
||||
#define PCRE2_ALLOW_EMPTY_CLASS 0x00000001u /* C */
|
||||
#define PCRE2_ALT_BSUX 0x00000002u /* C */
|
||||
#define PCRE2_AUTO_CALLOUT 0x00000004u /* C */
|
||||
#define PCRE2_CASELESS 0x00000008u /* C */
|
||||
#define PCRE2_DOLLAR_ENDONLY 0x00000010u /* J M D */
|
||||
#define PCRE2_DOTALL 0x00000020u /* C */
|
||||
#define PCRE2_DUPNAMES 0x00000040u /* C */
|
||||
#define PCRE2_EXTENDED 0x00000080u /* C */
|
||||
#define PCRE2_FIRSTLINE 0x00000100u /* J M D */
|
||||
#define PCRE2_MATCH_UNSET_BACKREF 0x00000200u /* C J M */
|
||||
#define PCRE2_MULTILINE 0x00000400u /* C */
|
||||
#define PCRE2_NEVER_UCP 0x00000800u /* C */
|
||||
#define PCRE2_NEVER_UTF 0x00001000u /* C */
|
||||
#define PCRE2_NO_AUTO_CAPTURE 0x00002000u /* C */
|
||||
#define PCRE2_NO_AUTO_POSSESS 0x00004000u /* C */
|
||||
#define PCRE2_NO_DOTSTAR_ANCHOR 0x00008000u /* C */
|
||||
#define PCRE2_NO_START_OPTIMIZE 0x00010000u /* J M D */
|
||||
#define PCRE2_UCP 0x00020000u /* C J M D */
|
||||
#define PCRE2_UNGREEDY 0x00040000u /* C */
|
||||
#define PCRE2_UTF 0x00080000u /* C J M D */
|
||||
#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */
|
||||
#define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */
|
||||
#define PCRE2_ALT_VERBNAMES 0x00400000u /* C */
|
||||
#define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */
|
||||
#define PCRE2_EXTENDED_MORE 0x01000000u /* C */
|
||||
#define PCRE2_LITERAL 0x02000000u /* C */
|
||||
#define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */
|
||||
|
||||
/* An additional compile options word is available in the compile context. */
|
||||
|
||||
#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */
|
||||
#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */
|
||||
#define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */
|
||||
#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */
|
||||
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
|
||||
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
|
||||
#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
#define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */
|
||||
#define PCRE2_JIT_PARTIAL_SOFT 0x00000002u
|
||||
#define PCRE2_JIT_PARTIAL_HARD 0x00000004u
|
||||
#define PCRE2_JIT_INVALID_UTF 0x00000100u
|
||||
|
||||
/* These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and
|
||||
pcre2_substitute(). Some are allowed only for one of the functions, and in
|
||||
these cases it is noted below. Note that PCRE2_ANCHORED, PCRE2_ENDANCHORED and
|
||||
PCRE2_NO_UTF_CHECK can also be passed to these functions (though
|
||||
pcre2_jit_match() ignores the latter since it bypasses all sanity checks). */
|
||||
|
||||
#define PCRE2_NOTBOL 0x00000001u
|
||||
#define PCRE2_NOTEOL 0x00000002u
|
||||
#define PCRE2_NOTEMPTY 0x00000004u /* ) These two must be kept */
|
||||
#define PCRE2_NOTEMPTY_ATSTART 0x00000008u /* ) adjacent to each other. */
|
||||
#define PCRE2_PARTIAL_SOFT 0x00000010u
|
||||
#define PCRE2_PARTIAL_HARD 0x00000020u
|
||||
#define PCRE2_DFA_RESTART 0x00000040u /* pcre2_dfa_match() only */
|
||||
#define PCRE2_DFA_SHORTEST 0x00000080u /* pcre2_dfa_match() only */
|
||||
#define PCRE2_SUBSTITUTE_GLOBAL 0x00000100u /* pcre2_substitute() only */
|
||||
#define PCRE2_SUBSTITUTE_EXTENDED 0x00000200u /* pcre2_substitute() only */
|
||||
#define PCRE2_SUBSTITUTE_UNSET_EMPTY 0x00000400u /* pcre2_substitute() only */
|
||||
#define PCRE2_SUBSTITUTE_UNKNOWN_UNSET 0x00000800u /* pcre2_substitute() only */
|
||||
#define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u /* pcre2_substitute() only */
|
||||
#define PCRE2_NO_JIT 0x00002000u /* Not for pcre2_dfa_match() */
|
||||
#define PCRE2_COPY_MATCHED_SUBJECT 0x00004000u
|
||||
#define PCRE2_SUBSTITUTE_LITERAL 0x00008000u /* pcre2_substitute() only */
|
||||
#define PCRE2_SUBSTITUTE_MATCHED 0x00010000u /* pcre2_substitute() only */
|
||||
#define PCRE2_SUBSTITUTE_REPLACEMENT_ONLY 0x00020000u /* pcre2_substitute() only */
|
||||
|
||||
/* Options for pcre2_pattern_convert(). */
|
||||
|
||||
#define PCRE2_CONVERT_UTF 0x00000001u
|
||||
#define PCRE2_CONVERT_NO_UTF_CHECK 0x00000002u
|
||||
#define PCRE2_CONVERT_POSIX_BASIC 0x00000004u
|
||||
#define PCRE2_CONVERT_POSIX_EXTENDED 0x00000008u
|
||||
#define PCRE2_CONVERT_GLOB 0x00000010u
|
||||
#define PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR 0x00000030u
|
||||
#define PCRE2_CONVERT_GLOB_NO_STARSTAR 0x00000050u
|
||||
|
||||
/* Newline and \R settings, for use in compile contexts. The newline values
|
||||
must be kept in step with values set in config.h and both sets must all be
|
||||
greater than zero. */
|
||||
|
||||
#define PCRE2_NEWLINE_CR 1
|
||||
#define PCRE2_NEWLINE_LF 2
|
||||
#define PCRE2_NEWLINE_CRLF 3
|
||||
#define PCRE2_NEWLINE_ANY 4
|
||||
#define PCRE2_NEWLINE_ANYCRLF 5
|
||||
#define PCRE2_NEWLINE_NUL 6
|
||||
|
||||
#define PCRE2_BSR_UNICODE 1
|
||||
#define PCRE2_BSR_ANYCRLF 2
|
||||
|
||||
/* Error codes for pcre2_compile(). Some of these are also used by
|
||||
pcre2_pattern_convert(). */
|
||||
|
||||
#define PCRE2_ERROR_END_BACKSLASH 101
|
||||
#define PCRE2_ERROR_END_BACKSLASH_C 102
|
||||
#define PCRE2_ERROR_UNKNOWN_ESCAPE 103
|
||||
#define PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER 104
|
||||
#define PCRE2_ERROR_QUANTIFIER_TOO_BIG 105
|
||||
#define PCRE2_ERROR_MISSING_SQUARE_BRACKET 106
|
||||
#define PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS 107
|
||||
#define PCRE2_ERROR_CLASS_RANGE_ORDER 108
|
||||
#define PCRE2_ERROR_QUANTIFIER_INVALID 109
|
||||
#define PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT 110
|
||||
#define PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY 111
|
||||
#define PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS 112
|
||||
#define PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING 113
|
||||
#define PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS 114
|
||||
#define PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE 115
|
||||
#define PCRE2_ERROR_NULL_PATTERN 116
|
||||
#define PCRE2_ERROR_BAD_OPTIONS 117
|
||||
#define PCRE2_ERROR_MISSING_COMMENT_CLOSING 118
|
||||
#define PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP 119
|
||||
#define PCRE2_ERROR_PATTERN_TOO_LARGE 120
|
||||
#define PCRE2_ERROR_HEAP_FAILED 121
|
||||
#define PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS 122
|
||||
#define PCRE2_ERROR_INTERNAL_CODE_OVERFLOW 123
|
||||
#define PCRE2_ERROR_MISSING_CONDITION_CLOSING 124
|
||||
#define PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH 125
|
||||
#define PCRE2_ERROR_ZERO_RELATIVE_REFERENCE 126
|
||||
#define PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES 127
|
||||
#define PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED 128
|
||||
#define PCRE2_ERROR_BAD_RELATIVE_REFERENCE 129
|
||||
#define PCRE2_ERROR_UNKNOWN_POSIX_CLASS 130
|
||||
#define PCRE2_ERROR_INTERNAL_STUDY_ERROR 131
|
||||
#define PCRE2_ERROR_UNICODE_NOT_SUPPORTED 132
|
||||
#define PCRE2_ERROR_PARENTHESES_STACK_CHECK 133
|
||||
#define PCRE2_ERROR_CODE_POINT_TOO_BIG 134
|
||||
#define PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED 135
|
||||
#define PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C 136
|
||||
#define PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE 137
|
||||
#define PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG 138
|
||||
#define PCRE2_ERROR_MISSING_CALLOUT_CLOSING 139
|
||||
#define PCRE2_ERROR_ESCAPE_INVALID_IN_VERB 140
|
||||
#define PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P 141
|
||||
#define PCRE2_ERROR_MISSING_NAME_TERMINATOR 142
|
||||
#define PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME 143
|
||||
#define PCRE2_ERROR_INVALID_SUBPATTERN_NAME 144
|
||||
#define PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE 145
|
||||
#define PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY 146
|
||||
#define PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY 147
|
||||
#define PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG 148
|
||||
#define PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS 149
|
||||
#define PCRE2_ERROR_CLASS_INVALID_RANGE 150
|
||||
#define PCRE2_ERROR_OCTAL_BYTE_TOO_BIG 151
|
||||
#define PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE 152
|
||||
#define PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN 153
|
||||
#define PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES 154
|
||||
#define PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE 155
|
||||
#define PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE 156
|
||||
#define PCRE2_ERROR_BACKSLASH_G_SYNTAX 157
|
||||
#define PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING 158
|
||||
/* Error 159 is obsolete and should now never occur */
|
||||
#define PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED 159
|
||||
#define PCRE2_ERROR_VERB_UNKNOWN 160
|
||||
#define PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG 161
|
||||
#define PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED 162
|
||||
#define PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW 163
|
||||
#define PCRE2_ERROR_INVALID_OCTAL 164
|
||||
#define PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH 165
|
||||
#define PCRE2_ERROR_MARK_MISSING_ARGUMENT 166
|
||||
#define PCRE2_ERROR_INVALID_HEXADECIMAL 167
|
||||
#define PCRE2_ERROR_BACKSLASH_C_SYNTAX 168
|
||||
#define PCRE2_ERROR_BACKSLASH_K_SYNTAX 169
|
||||
#define PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS 170
|
||||
#define PCRE2_ERROR_BACKSLASH_N_IN_CLASS 171
|
||||
#define PCRE2_ERROR_CALLOUT_STRING_TOO_LONG 172
|
||||
#define PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT 173
|
||||
#define PCRE2_ERROR_UTF_IS_DISABLED 174
|
||||
#define PCRE2_ERROR_UCP_IS_DISABLED 175
|
||||
#define PCRE2_ERROR_VERB_NAME_TOO_LONG 176
|
||||
#define PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG 177
|
||||
#define PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS 178
|
||||
#define PCRE2_ERROR_VERSION_CONDITION_SYNTAX 179
|
||||
#define PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS 180
|
||||
#define PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER 181
|
||||
#define PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER 182
|
||||
#define PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED 183
|
||||
#define PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP 184
|
||||
#define PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED 185
|
||||
#define PCRE2_ERROR_PATTERN_TOO_COMPLICATED 186
|
||||
#define PCRE2_ERROR_LOOKBEHIND_TOO_LONG 187
|
||||
#define PCRE2_ERROR_PATTERN_STRING_TOO_LONG 188
|
||||
#define PCRE2_ERROR_INTERNAL_BAD_CODE 189
|
||||
#define PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP 190
|
||||
#define PCRE2_ERROR_NO_SURROGATES_IN_UTF16 191
|
||||
#define PCRE2_ERROR_BAD_LITERAL_OPTIONS 192
|
||||
#define PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE 193
|
||||
#define PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS 194
|
||||
#define PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN 195
|
||||
#define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE 196
|
||||
#define PCRE2_ERROR_TOO_MANY_CAPTURES 197
|
||||
#define PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED 198
|
||||
#define PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND 199
|
||||
|
||||
|
||||
/* "Expected" matching error codes: no match and partial match. */
|
||||
|
||||
#define PCRE2_ERROR_NOMATCH (-1)
|
||||
#define PCRE2_ERROR_PARTIAL (-2)
|
||||
|
||||
/* Error codes for UTF-8 validity checks */
|
||||
|
||||
#define PCRE2_ERROR_UTF8_ERR1 (-3)
|
||||
#define PCRE2_ERROR_UTF8_ERR2 (-4)
|
||||
#define PCRE2_ERROR_UTF8_ERR3 (-5)
|
||||
#define PCRE2_ERROR_UTF8_ERR4 (-6)
|
||||
#define PCRE2_ERROR_UTF8_ERR5 (-7)
|
||||
#define PCRE2_ERROR_UTF8_ERR6 (-8)
|
||||
#define PCRE2_ERROR_UTF8_ERR7 (-9)
|
||||
#define PCRE2_ERROR_UTF8_ERR8 (-10)
|
||||
#define PCRE2_ERROR_UTF8_ERR9 (-11)
|
||||
#define PCRE2_ERROR_UTF8_ERR10 (-12)
|
||||
#define PCRE2_ERROR_UTF8_ERR11 (-13)
|
||||
#define PCRE2_ERROR_UTF8_ERR12 (-14)
|
||||
#define PCRE2_ERROR_UTF8_ERR13 (-15)
|
||||
#define PCRE2_ERROR_UTF8_ERR14 (-16)
|
||||
#define PCRE2_ERROR_UTF8_ERR15 (-17)
|
||||
#define PCRE2_ERROR_UTF8_ERR16 (-18)
|
||||
#define PCRE2_ERROR_UTF8_ERR17 (-19)
|
||||
#define PCRE2_ERROR_UTF8_ERR18 (-20)
|
||||
#define PCRE2_ERROR_UTF8_ERR19 (-21)
|
||||
#define PCRE2_ERROR_UTF8_ERR20 (-22)
|
||||
#define PCRE2_ERROR_UTF8_ERR21 (-23)
|
||||
|
||||
/* Error codes for UTF-16 validity checks */
|
||||
|
||||
#define PCRE2_ERROR_UTF16_ERR1 (-24)
|
||||
#define PCRE2_ERROR_UTF16_ERR2 (-25)
|
||||
#define PCRE2_ERROR_UTF16_ERR3 (-26)
|
||||
|
||||
/* Error codes for UTF-32 validity checks */
|
||||
|
||||
#define PCRE2_ERROR_UTF32_ERR1 (-27)
|
||||
#define PCRE2_ERROR_UTF32_ERR2 (-28)
|
||||
|
||||
/* Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction
|
||||
functions, context functions, and serializing functions. They are in numerical
|
||||
order. Originally they were in alphabetical order too, but now that PCRE2 is
|
||||
released, the numbers must not be changed. */
|
||||
|
||||
#define PCRE2_ERROR_BADDATA (-29)
|
||||
#define PCRE2_ERROR_MIXEDTABLES (-30) /* Name was changed */
|
||||
#define PCRE2_ERROR_BADMAGIC (-31)
|
||||
#define PCRE2_ERROR_BADMODE (-32)
|
||||
#define PCRE2_ERROR_BADOFFSET (-33)
|
||||
#define PCRE2_ERROR_BADOPTION (-34)
|
||||
#define PCRE2_ERROR_BADREPLACEMENT (-35)
|
||||
#define PCRE2_ERROR_BADUTFOFFSET (-36)
|
||||
#define PCRE2_ERROR_CALLOUT (-37) /* Never used by PCRE2 itself */
|
||||
#define PCRE2_ERROR_DFA_BADRESTART (-38)
|
||||
#define PCRE2_ERROR_DFA_RECURSE (-39)
|
||||
#define PCRE2_ERROR_DFA_UCOND (-40)
|
||||
#define PCRE2_ERROR_DFA_UFUNC (-41)
|
||||
#define PCRE2_ERROR_DFA_UITEM (-42)
|
||||
#define PCRE2_ERROR_DFA_WSSIZE (-43)
|
||||
#define PCRE2_ERROR_INTERNAL (-44)
|
||||
#define PCRE2_ERROR_JIT_BADOPTION (-45)
|
||||
#define PCRE2_ERROR_JIT_STACKLIMIT (-46)
|
||||
#define PCRE2_ERROR_MATCHLIMIT (-47)
|
||||
#define PCRE2_ERROR_NOMEMORY (-48)
|
||||
#define PCRE2_ERROR_NOSUBSTRING (-49)
|
||||
#define PCRE2_ERROR_NOUNIQUESUBSTRING (-50)
|
||||
#define PCRE2_ERROR_NULL (-51)
|
||||
#define PCRE2_ERROR_RECURSELOOP (-52)
|
||||
#define PCRE2_ERROR_DEPTHLIMIT (-53)
|
||||
#define PCRE2_ERROR_RECURSIONLIMIT (-53) /* Obsolete synonym */
|
||||
#define PCRE2_ERROR_UNAVAILABLE (-54)
|
||||
#define PCRE2_ERROR_UNSET (-55)
|
||||
#define PCRE2_ERROR_BADOFFSETLIMIT (-56)
|
||||
#define PCRE2_ERROR_BADREPESCAPE (-57)
|
||||
#define PCRE2_ERROR_REPMISSINGBRACE (-58)
|
||||
#define PCRE2_ERROR_BADSUBSTITUTION (-59)
|
||||
#define PCRE2_ERROR_BADSUBSPATTERN (-60)
|
||||
#define PCRE2_ERROR_TOOMANYREPLACE (-61)
|
||||
#define PCRE2_ERROR_BADSERIALIZEDDATA (-62)
|
||||
#define PCRE2_ERROR_HEAPLIMIT (-63)
|
||||
#define PCRE2_ERROR_CONVERT_SYNTAX (-64)
|
||||
#define PCRE2_ERROR_INTERNAL_DUPMATCH (-65)
|
||||
#define PCRE2_ERROR_DFA_UINVALID_UTF (-66)
|
||||
|
||||
|
||||
/* Request types for pcre2_pattern_info() */
|
||||
|
||||
#define PCRE2_INFO_ALLOPTIONS 0
|
||||
#define PCRE2_INFO_ARGOPTIONS 1
|
||||
#define PCRE2_INFO_BACKREFMAX 2
|
||||
#define PCRE2_INFO_BSR 3
|
||||
#define PCRE2_INFO_CAPTURECOUNT 4
|
||||
#define PCRE2_INFO_FIRSTCODEUNIT 5
|
||||
#define PCRE2_INFO_FIRSTCODETYPE 6
|
||||
#define PCRE2_INFO_FIRSTBITMAP 7
|
||||
#define PCRE2_INFO_HASCRORLF 8
|
||||
#define PCRE2_INFO_JCHANGED 9
|
||||
#define PCRE2_INFO_JITSIZE 10
|
||||
#define PCRE2_INFO_LASTCODEUNIT 11
|
||||
#define PCRE2_INFO_LASTCODETYPE 12
|
||||
#define PCRE2_INFO_MATCHEMPTY 13
|
||||
#define PCRE2_INFO_MATCHLIMIT 14
|
||||
#define PCRE2_INFO_MAXLOOKBEHIND 15
|
||||
#define PCRE2_INFO_MINLENGTH 16
|
||||
#define PCRE2_INFO_NAMECOUNT 17
|
||||
#define PCRE2_INFO_NAMEENTRYSIZE 18
|
||||
#define PCRE2_INFO_NAMETABLE 19
|
||||
#define PCRE2_INFO_NEWLINE 20
|
||||
#define PCRE2_INFO_DEPTHLIMIT 21
|
||||
#define PCRE2_INFO_RECURSIONLIMIT 21 /* Obsolete synonym */
|
||||
#define PCRE2_INFO_SIZE 22
|
||||
#define PCRE2_INFO_HASBACKSLASHC 23
|
||||
#define PCRE2_INFO_FRAMESIZE 24
|
||||
#define PCRE2_INFO_HEAPLIMIT 25
|
||||
#define PCRE2_INFO_EXTRAOPTIONS 26
|
||||
|
||||
/* Request types for pcre2_config(). */
|
||||
|
||||
#define PCRE2_CONFIG_BSR 0
|
||||
#define PCRE2_CONFIG_JIT 1
|
||||
#define PCRE2_CONFIG_JITTARGET 2
|
||||
#define PCRE2_CONFIG_LINKSIZE 3
|
||||
#define PCRE2_CONFIG_MATCHLIMIT 4
|
||||
#define PCRE2_CONFIG_NEWLINE 5
|
||||
#define PCRE2_CONFIG_PARENSLIMIT 6
|
||||
#define PCRE2_CONFIG_DEPTHLIMIT 7
|
||||
#define PCRE2_CONFIG_RECURSIONLIMIT 7 /* Obsolete synonym */
|
||||
#define PCRE2_CONFIG_STACKRECURSE 8 /* Obsolete */
|
||||
#define PCRE2_CONFIG_UNICODE 9
|
||||
#define PCRE2_CONFIG_UNICODE_VERSION 10
|
||||
#define PCRE2_CONFIG_VERSION 11
|
||||
#define PCRE2_CONFIG_HEAPLIMIT 12
|
||||
#define PCRE2_CONFIG_NEVER_BACKSLASH_C 13
|
||||
#define PCRE2_CONFIG_COMPILED_WIDTHS 14
|
||||
#define PCRE2_CONFIG_TABLES_LENGTH 15
|
||||
|
||||
|
||||
/* Types for code units in patterns and subject strings. */
|
||||
|
||||
typedef uint8_t PCRE2_UCHAR8;
|
||||
typedef uint16_t PCRE2_UCHAR16;
|
||||
typedef uint32_t PCRE2_UCHAR32;
|
||||
|
||||
typedef const PCRE2_UCHAR8 *PCRE2_SPTR8;
|
||||
typedef const PCRE2_UCHAR16 *PCRE2_SPTR16;
|
||||
typedef const PCRE2_UCHAR32 *PCRE2_SPTR32;
|
||||
|
||||
/* The PCRE2_SIZE type is used for all string lengths and offsets in PCRE2,
|
||||
including pattern offsets for errors and subject offsets after a match. We
|
||||
define special values to indicate zero-terminated strings and unset offsets in
|
||||
the offset vector (ovector). */
|
||||
|
||||
#define PCRE2_SIZE size_t
|
||||
#define PCRE2_SIZE_MAX SIZE_MAX
|
||||
#define PCRE2_ZERO_TERMINATED (~(PCRE2_SIZE)0)
|
||||
#define PCRE2_UNSET (~(PCRE2_SIZE)0)
|
||||
|
||||
/* Generic types for opaque structures and JIT callback functions. These
|
||||
declarations are defined in a macro that is expanded for each width later. */
|
||||
|
||||
#define PCRE2_TYPES_LIST \
|
||||
struct pcre2_real_general_context; \
|
||||
typedef struct pcre2_real_general_context pcre2_general_context; \
|
||||
\
|
||||
struct pcre2_real_compile_context; \
|
||||
typedef struct pcre2_real_compile_context pcre2_compile_context; \
|
||||
\
|
||||
struct pcre2_real_match_context; \
|
||||
typedef struct pcre2_real_match_context pcre2_match_context; \
|
||||
\
|
||||
struct pcre2_real_convert_context; \
|
||||
typedef struct pcre2_real_convert_context pcre2_convert_context; \
|
||||
\
|
||||
struct pcre2_real_code; \
|
||||
typedef struct pcre2_real_code pcre2_code; \
|
||||
\
|
||||
struct pcre2_real_match_data; \
|
||||
typedef struct pcre2_real_match_data pcre2_match_data; \
|
||||
\
|
||||
struct pcre2_real_jit_stack; \
|
||||
typedef struct pcre2_real_jit_stack pcre2_jit_stack; \
|
||||
\
|
||||
typedef pcre2_jit_stack *(*pcre2_jit_callback)(void *);
|
||||
|
||||
|
||||
/* The structures for passing out data via callout functions. We use structures
|
||||
so that new fields can be added on the end in future versions, without changing
|
||||
the API of the function, thereby allowing old clients to work without
|
||||
modification. Define the generic versions in a macro; the width-specific
|
||||
versions are generated from this macro below. */
|
||||
|
||||
/* Flags for the callout_flags field. These are cleared after a callout. */
|
||||
|
||||
#define PCRE2_CALLOUT_STARTMATCH 0x00000001u /* Set for each bumpalong */
|
||||
#define PCRE2_CALLOUT_BACKTRACK 0x00000002u /* Set after a backtrack */
|
||||
|
||||
#define PCRE2_STRUCTURE_LIST \
|
||||
typedef struct pcre2_callout_block { \
|
||||
uint32_t version; /* Identifies version of block */ \
|
||||
/* ------------------------ Version 0 ------------------------------- */ \
|
||||
uint32_t callout_number; /* Number compiled into pattern */ \
|
||||
uint32_t capture_top; /* Max current capture */ \
|
||||
uint32_t capture_last; /* Most recently closed capture */ \
|
||||
PCRE2_SIZE *offset_vector; /* The offset vector */ \
|
||||
PCRE2_SPTR mark; /* Pointer to current mark or NULL */ \
|
||||
PCRE2_SPTR subject; /* The subject being matched */ \
|
||||
PCRE2_SIZE subject_length; /* The length of the subject */ \
|
||||
PCRE2_SIZE start_match; /* Offset to start of this match attempt */ \
|
||||
PCRE2_SIZE current_position; /* Where we currently are in the subject */ \
|
||||
PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \
|
||||
PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \
|
||||
/* ------------------- Added for Version 1 -------------------------- */ \
|
||||
PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \
|
||||
PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \
|
||||
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
|
||||
/* ------------------- Added for Version 2 -------------------------- */ \
|
||||
uint32_t callout_flags; /* See above for list */ \
|
||||
/* ------------------------------------------------------------------ */ \
|
||||
} pcre2_callout_block; \
|
||||
\
|
||||
typedef struct pcre2_callout_enumerate_block { \
|
||||
uint32_t version; /* Identifies version of block */ \
|
||||
/* ------------------------ Version 0 ------------------------------- */ \
|
||||
PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \
|
||||
PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \
|
||||
uint32_t callout_number; /* Number compiled into pattern */ \
|
||||
PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \
|
||||
PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \
|
||||
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
|
||||
/* ------------------------------------------------------------------ */ \
|
||||
} pcre2_callout_enumerate_block; \
|
||||
\
|
||||
typedef struct pcre2_substitute_callout_block { \
|
||||
uint32_t version; /* Identifies version of block */ \
|
||||
/* ------------------------ Version 0 ------------------------------- */ \
|
||||
PCRE2_SPTR input; /* Pointer to input subject string */ \
|
||||
PCRE2_SPTR output; /* Pointer to output buffer */ \
|
||||
PCRE2_SIZE output_offsets[2]; /* Changed portion of the output */ \
|
||||
PCRE2_SIZE *ovector; /* Pointer to current ovector */ \
|
||||
uint32_t oveccount; /* Count of pairs set in ovector */ \
|
||||
uint32_t subscount; /* Substitution number */ \
|
||||
/* ------------------------------------------------------------------ */ \
|
||||
} pcre2_substitute_callout_block;
|
||||
|
||||
|
||||
/* List the generic forms of all other functions in macros, which will be
|
||||
expanded for each width below. Start with functions that give general
|
||||
information. */
|
||||
|
||||
#define PCRE2_GENERAL_INFO_FUNCTIONS \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_config(uint32_t, void *);
|
||||
|
||||
|
||||
/* Functions for manipulating contexts. */
|
||||
|
||||
#define PCRE2_GENERAL_CONTEXT_FUNCTIONS \
|
||||
PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \
|
||||
pcre2_general_context_copy(pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \
|
||||
pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \
|
||||
void (*)(void *, void *), void *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_general_context_free(pcre2_general_context *);
|
||||
|
||||
#define PCRE2_COMPILE_CONTEXT_FUNCTIONS \
|
||||
PCRE2_EXP_DECL pcre2_compile_context *PCRE2_CALL_CONVENTION \
|
||||
pcre2_compile_context_copy(pcre2_compile_context *); \
|
||||
PCRE2_EXP_DECL pcre2_compile_context *PCRE2_CALL_CONVENTION \
|
||||
pcre2_compile_context_create(pcre2_general_context *);\
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_compile_context_free(pcre2_compile_context *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_bsr(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_character_tables(pcre2_compile_context *, const uint8_t *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_compile_extra_options(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_newline(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_parens_nest_limit(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_compile_recursion_guard(pcre2_compile_context *, \
|
||||
int (*)(uint32_t, void *), void *);
|
||||
|
||||
#define PCRE2_MATCH_CONTEXT_FUNCTIONS \
|
||||
PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \
|
||||
pcre2_match_context_copy(pcre2_match_context *); \
|
||||
PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \
|
||||
pcre2_match_context_create(pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_match_context_free(pcre2_match_context *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_callout(pcre2_match_context *, \
|
||||
int (*)(pcre2_callout_block *, void *), void *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_substitute_callout(pcre2_match_context *, \
|
||||
int (*)(pcre2_substitute_callout_block *, void *), void *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_heap_limit(pcre2_match_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_match_limit(pcre2_match_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_offset_limit(pcre2_match_context *, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_recursion_limit(pcre2_match_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_recursion_memory_management(pcre2_match_context *, \
|
||||
void *(*)(PCRE2_SIZE, void *), void (*)(void *, void *), void *);
|
||||
|
||||
#define PCRE2_CONVERT_CONTEXT_FUNCTIONS \
|
||||
PCRE2_EXP_DECL pcre2_convert_context *PCRE2_CALL_CONVENTION \
|
||||
pcre2_convert_context_copy(pcre2_convert_context *); \
|
||||
PCRE2_EXP_DECL pcre2_convert_context *PCRE2_CALL_CONVENTION \
|
||||
pcre2_convert_context_create(pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_convert_context_free(pcre2_convert_context *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_glob_escape(pcre2_convert_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_glob_separator(pcre2_convert_context *, uint32_t);
|
||||
|
||||
|
||||
/* Functions concerned with compiling a pattern to PCRE internal code. */
|
||||
|
||||
#define PCRE2_COMPILE_FUNCTIONS \
|
||||
PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \
|
||||
pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, \
|
||||
pcre2_compile_context *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_code_free(pcre2_code *); \
|
||||
PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \
|
||||
pcre2_code_copy(const pcre2_code *); \
|
||||
PCRE2_EXP_DECL pcre2_code *PCRE2_CALL_CONVENTION \
|
||||
pcre2_code_copy_with_tables(const pcre2_code *);
|
||||
|
||||
|
||||
/* Functions that give information about a compiled pattern. */
|
||||
|
||||
#define PCRE2_PATTERN_INFO_FUNCTIONS \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_pattern_info(const pcre2_code *, uint32_t, void *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_callout_enumerate(const pcre2_code *, \
|
||||
int (*)(pcre2_callout_enumerate_block *, void *), void *);
|
||||
|
||||
|
||||
/* Functions for running a match and inspecting the result. */
|
||||
|
||||
#define PCRE2_MATCH_FUNCTIONS \
|
||||
PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \
|
||||
pcre2_match_data_create(uint32_t, pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \
|
||||
pcre2_match_data_create_from_pattern(const pcre2_code *, \
|
||||
pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \
|
||||
uint32_t, pcre2_match_data *, pcre2_match_context *, int *, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \
|
||||
uint32_t, pcre2_match_data *, pcre2_match_context *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_match_data_free(pcre2_match_data *); \
|
||||
PCRE2_EXP_DECL PCRE2_SPTR PCRE2_CALL_CONVENTION \
|
||||
pcre2_get_mark(pcre2_match_data *); \
|
||||
PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
|
||||
pcre2_get_match_data_size(pcre2_match_data *); \
|
||||
PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \
|
||||
pcre2_get_ovector_count(pcre2_match_data *); \
|
||||
PCRE2_EXP_DECL PCRE2_SIZE *PCRE2_CALL_CONVENTION \
|
||||
pcre2_get_ovector_pointer(pcre2_match_data *); \
|
||||
PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
|
||||
pcre2_get_startchar(pcre2_match_data *);
|
||||
|
||||
|
||||
/* Convenience functions for handling matched substrings. */
|
||||
|
||||
#define PCRE2_SUBSTRING_FUNCTIONS \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_copy_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR *, \
|
||||
PCRE2_SIZE *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_copy_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR *, \
|
||||
PCRE2_SIZE *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_free(PCRE2_UCHAR *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_get_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR **, \
|
||||
PCRE2_SIZE *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_get_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR **, \
|
||||
PCRE2_SIZE *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_length_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_SIZE *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_length_bynumber(pcre2_match_data *, uint32_t, PCRE2_SIZE *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_nametable_scan(const pcre2_code *, PCRE2_SPTR, PCRE2_SPTR *, \
|
||||
PCRE2_SPTR *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_number_from_name(const pcre2_code *, PCRE2_SPTR); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_list_free(PCRE2_SPTR *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_list_get(pcre2_match_data *, PCRE2_UCHAR ***, PCRE2_SIZE **);
|
||||
|
||||
/* Functions for serializing / deserializing compiled patterns. */
|
||||
|
||||
#define PCRE2_SERIALIZE_FUNCTIONS \
|
||||
PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \
|
||||
pcre2_serialize_encode(const pcre2_code **, int32_t, uint8_t **, \
|
||||
PCRE2_SIZE *, pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \
|
||||
pcre2_serialize_decode(pcre2_code **, int32_t, const uint8_t *, \
|
||||
pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \
|
||||
pcre2_serialize_get_number_of_codes(const uint8_t *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_serialize_free(uint8_t *);
|
||||
|
||||
|
||||
/* Convenience function for match + substitute. */
|
||||
|
||||
#define PCRE2_SUBSTITUTE_FUNCTION \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substitute(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \
|
||||
uint32_t, pcre2_match_data *, pcre2_match_context *, PCRE2_SPTR, \
|
||||
PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE *);
|
||||
|
||||
|
||||
/* Functions for converting pattern source strings. */
|
||||
|
||||
#define PCRE2_CONVERT_FUNCTIONS \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_pattern_convert(PCRE2_SPTR, PCRE2_SIZE, uint32_t, PCRE2_UCHAR **, \
|
||||
PCRE2_SIZE *, pcre2_convert_context *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_converted_pattern_free(PCRE2_UCHAR *);
|
||||
|
||||
|
||||
/* Functions for JIT processing */
|
||||
|
||||
#define PCRE2_JIT_FUNCTIONS \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_jit_compile(pcre2_code *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_jit_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \
|
||||
uint32_t, pcre2_match_data *, pcre2_match_context *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_jit_free_unused_memory(pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL pcre2_jit_stack *PCRE2_CALL_CONVENTION \
|
||||
pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_jit_stack_assign(pcre2_match_context *, pcre2_jit_callback, void *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_jit_stack_free(pcre2_jit_stack *);
|
||||
|
||||
|
||||
/* Other miscellaneous functions. */
|
||||
|
||||
#define PCRE2_OTHER_FUNCTIONS \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_get_error_message(int, PCRE2_UCHAR *, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL const uint8_t *PCRE2_CALL_CONVENTION \
|
||||
pcre2_maketables(pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_maketables_free(pcre2_general_context *, const uint8_t *);
|
||||
|
||||
/* Define macros that generate width-specific names from generic versions. The
|
||||
three-level macro scheme is necessary to get the macros expanded when we want
|
||||
them to be. First we get the width from PCRE2_LOCAL_WIDTH, which is used for
|
||||
generating three versions of everything below. After that, PCRE2_SUFFIX will be
|
||||
re-defined to use PCRE2_CODE_UNIT_WIDTH, for use when macros such as
|
||||
pcre2_compile are called by application code. */
|
||||
|
||||
#define PCRE2_JOIN(a,b) a ## b
|
||||
#define PCRE2_GLUE(a,b) PCRE2_JOIN(a,b)
|
||||
#define PCRE2_SUFFIX(a) PCRE2_GLUE(a,PCRE2_LOCAL_WIDTH)
|
||||
|
||||
|
||||
/* Data types */
|
||||
|
||||
#define PCRE2_UCHAR PCRE2_SUFFIX(PCRE2_UCHAR)
|
||||
#define PCRE2_SPTR PCRE2_SUFFIX(PCRE2_SPTR)
|
||||
|
||||
#define pcre2_code PCRE2_SUFFIX(pcre2_code_)
|
||||
#define pcre2_jit_callback PCRE2_SUFFIX(pcre2_jit_callback_)
|
||||
#define pcre2_jit_stack PCRE2_SUFFIX(pcre2_jit_stack_)
|
||||
|
||||
#define pcre2_real_code PCRE2_SUFFIX(pcre2_real_code_)
|
||||
#define pcre2_real_general_context PCRE2_SUFFIX(pcre2_real_general_context_)
|
||||
#define pcre2_real_compile_context PCRE2_SUFFIX(pcre2_real_compile_context_)
|
||||
#define pcre2_real_convert_context PCRE2_SUFFIX(pcre2_real_convert_context_)
|
||||
#define pcre2_real_match_context PCRE2_SUFFIX(pcre2_real_match_context_)
|
||||
#define pcre2_real_jit_stack PCRE2_SUFFIX(pcre2_real_jit_stack_)
|
||||
#define pcre2_real_match_data PCRE2_SUFFIX(pcre2_real_match_data_)
|
||||
|
||||
|
||||
/* Data blocks */
|
||||
|
||||
#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_)
|
||||
#define pcre2_callout_enumerate_block PCRE2_SUFFIX(pcre2_callout_enumerate_block_)
|
||||
#define pcre2_substitute_callout_block PCRE2_SUFFIX(pcre2_substitute_callout_block_)
|
||||
#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_)
|
||||
#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_)
|
||||
#define pcre2_convert_context PCRE2_SUFFIX(pcre2_convert_context_)
|
||||
#define pcre2_match_context PCRE2_SUFFIX(pcre2_match_context_)
|
||||
#define pcre2_match_data PCRE2_SUFFIX(pcre2_match_data_)
|
||||
|
||||
|
||||
/* Functions: the complete list in alphabetical order */
|
||||
|
||||
#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_)
|
||||
#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_)
|
||||
#define pcre2_code_copy_with_tables PCRE2_SUFFIX(pcre2_code_copy_with_tables_)
|
||||
#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_)
|
||||
#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_)
|
||||
#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_)
|
||||
#define pcre2_compile_context_create PCRE2_SUFFIX(pcre2_compile_context_create_)
|
||||
#define pcre2_compile_context_free PCRE2_SUFFIX(pcre2_compile_context_free_)
|
||||
#define pcre2_config PCRE2_SUFFIX(pcre2_config_)
|
||||
#define pcre2_convert_context_copy PCRE2_SUFFIX(pcre2_convert_context_copy_)
|
||||
#define pcre2_convert_context_create PCRE2_SUFFIX(pcre2_convert_context_create_)
|
||||
#define pcre2_convert_context_free PCRE2_SUFFIX(pcre2_convert_context_free_)
|
||||
#define pcre2_converted_pattern_free PCRE2_SUFFIX(pcre2_converted_pattern_free_)
|
||||
#define pcre2_dfa_match PCRE2_SUFFIX(pcre2_dfa_match_)
|
||||
#define pcre2_general_context_copy PCRE2_SUFFIX(pcre2_general_context_copy_)
|
||||
#define pcre2_general_context_create PCRE2_SUFFIX(pcre2_general_context_create_)
|
||||
#define pcre2_general_context_free PCRE2_SUFFIX(pcre2_general_context_free_)
|
||||
#define pcre2_get_error_message PCRE2_SUFFIX(pcre2_get_error_message_)
|
||||
#define pcre2_get_mark PCRE2_SUFFIX(pcre2_get_mark_)
|
||||
#define pcre2_get_match_data_size PCRE2_SUFFIX(pcre2_get_match_data_size_)
|
||||
#define pcre2_get_ovector_pointer PCRE2_SUFFIX(pcre2_get_ovector_pointer_)
|
||||
#define pcre2_get_ovector_count PCRE2_SUFFIX(pcre2_get_ovector_count_)
|
||||
#define pcre2_get_startchar PCRE2_SUFFIX(pcre2_get_startchar_)
|
||||
#define pcre2_jit_compile PCRE2_SUFFIX(pcre2_jit_compile_)
|
||||
#define pcre2_jit_match PCRE2_SUFFIX(pcre2_jit_match_)
|
||||
#define pcre2_jit_free_unused_memory PCRE2_SUFFIX(pcre2_jit_free_unused_memory_)
|
||||
#define pcre2_jit_stack_assign PCRE2_SUFFIX(pcre2_jit_stack_assign_)
|
||||
#define pcre2_jit_stack_create PCRE2_SUFFIX(pcre2_jit_stack_create_)
|
||||
#define pcre2_jit_stack_free PCRE2_SUFFIX(pcre2_jit_stack_free_)
|
||||
#define pcre2_maketables PCRE2_SUFFIX(pcre2_maketables_)
|
||||
#define pcre2_maketables_free PCRE2_SUFFIX(pcre2_maketables_free_)
|
||||
#define pcre2_match PCRE2_SUFFIX(pcre2_match_)
|
||||
#define pcre2_match_context_copy PCRE2_SUFFIX(pcre2_match_context_copy_)
|
||||
#define pcre2_match_context_create PCRE2_SUFFIX(pcre2_match_context_create_)
|
||||
#define pcre2_match_context_free PCRE2_SUFFIX(pcre2_match_context_free_)
|
||||
#define pcre2_match_data_create PCRE2_SUFFIX(pcre2_match_data_create_)
|
||||
#define pcre2_match_data_create_from_pattern PCRE2_SUFFIX(pcre2_match_data_create_from_pattern_)
|
||||
#define pcre2_match_data_free PCRE2_SUFFIX(pcre2_match_data_free_)
|
||||
#define pcre2_pattern_convert PCRE2_SUFFIX(pcre2_pattern_convert_)
|
||||
#define pcre2_pattern_info PCRE2_SUFFIX(pcre2_pattern_info_)
|
||||
#define pcre2_serialize_decode PCRE2_SUFFIX(pcre2_serialize_decode_)
|
||||
#define pcre2_serialize_encode PCRE2_SUFFIX(pcre2_serialize_encode_)
|
||||
#define pcre2_serialize_free PCRE2_SUFFIX(pcre2_serialize_free_)
|
||||
#define pcre2_serialize_get_number_of_codes PCRE2_SUFFIX(pcre2_serialize_get_number_of_codes_)
|
||||
#define pcre2_set_bsr PCRE2_SUFFIX(pcre2_set_bsr_)
|
||||
#define pcre2_set_callout PCRE2_SUFFIX(pcre2_set_callout_)
|
||||
#define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_)
|
||||
#define pcre2_set_compile_extra_options PCRE2_SUFFIX(pcre2_set_compile_extra_options_)
|
||||
#define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_)
|
||||
#define pcre2_set_depth_limit PCRE2_SUFFIX(pcre2_set_depth_limit_)
|
||||
#define pcre2_set_glob_escape PCRE2_SUFFIX(pcre2_set_glob_escape_)
|
||||
#define pcre2_set_glob_separator PCRE2_SUFFIX(pcre2_set_glob_separator_)
|
||||
#define pcre2_set_heap_limit PCRE2_SUFFIX(pcre2_set_heap_limit_)
|
||||
#define pcre2_set_match_limit PCRE2_SUFFIX(pcre2_set_match_limit_)
|
||||
#define pcre2_set_max_pattern_length PCRE2_SUFFIX(pcre2_set_max_pattern_length_)
|
||||
#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_)
|
||||
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
|
||||
#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_)
|
||||
#define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_)
|
||||
#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_)
|
||||
#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_)
|
||||
#define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_)
|
||||
#define pcre2_substring_free PCRE2_SUFFIX(pcre2_substring_free_)
|
||||
#define pcre2_substring_get_byname PCRE2_SUFFIX(pcre2_substring_get_byname_)
|
||||
#define pcre2_substring_get_bynumber PCRE2_SUFFIX(pcre2_substring_get_bynumber_)
|
||||
#define pcre2_substring_length_byname PCRE2_SUFFIX(pcre2_substring_length_byname_)
|
||||
#define pcre2_substring_length_bynumber PCRE2_SUFFIX(pcre2_substring_length_bynumber_)
|
||||
#define pcre2_substring_list_get PCRE2_SUFFIX(pcre2_substring_list_get_)
|
||||
#define pcre2_substring_list_free PCRE2_SUFFIX(pcre2_substring_list_free_)
|
||||
#define pcre2_substring_nametable_scan PCRE2_SUFFIX(pcre2_substring_nametable_scan_)
|
||||
#define pcre2_substring_number_from_name PCRE2_SUFFIX(pcre2_substring_number_from_name_)
|
||||
|
||||
/* Keep this old function name for backwards compatibility */
|
||||
#define pcre2_set_recursion_limit PCRE2_SUFFIX(pcre2_set_recursion_limit_)
|
||||
|
||||
/* Keep this obsolete function for backwards compatibility: it is now a noop. */
|
||||
#define pcre2_set_recursion_memory_management PCRE2_SUFFIX(pcre2_set_recursion_memory_management_)
|
||||
|
||||
/* Now generate all three sets of width-specific structures and function
|
||||
prototypes. */
|
||||
|
||||
#define PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS \
|
||||
PCRE2_TYPES_LIST \
|
||||
PCRE2_STRUCTURE_LIST \
|
||||
PCRE2_GENERAL_INFO_FUNCTIONS \
|
||||
PCRE2_GENERAL_CONTEXT_FUNCTIONS \
|
||||
PCRE2_COMPILE_CONTEXT_FUNCTIONS \
|
||||
PCRE2_CONVERT_CONTEXT_FUNCTIONS \
|
||||
PCRE2_CONVERT_FUNCTIONS \
|
||||
PCRE2_MATCH_CONTEXT_FUNCTIONS \
|
||||
PCRE2_COMPILE_FUNCTIONS \
|
||||
PCRE2_PATTERN_INFO_FUNCTIONS \
|
||||
PCRE2_MATCH_FUNCTIONS \
|
||||
PCRE2_SUBSTRING_FUNCTIONS \
|
||||
PCRE2_SERIALIZE_FUNCTIONS \
|
||||
PCRE2_SUBSTITUTE_FUNCTION \
|
||||
PCRE2_JIT_FUNCTIONS \
|
||||
PCRE2_OTHER_FUNCTIONS
|
||||
|
||||
#define PCRE2_LOCAL_WIDTH 8
|
||||
PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS
|
||||
#undef PCRE2_LOCAL_WIDTH
|
||||
|
||||
#define PCRE2_LOCAL_WIDTH 16
|
||||
PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS
|
||||
#undef PCRE2_LOCAL_WIDTH
|
||||
|
||||
#define PCRE2_LOCAL_WIDTH 32
|
||||
PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS
|
||||
#undef PCRE2_LOCAL_WIDTH
|
||||
|
||||
/* Undefine the list macros; they are no longer needed. */
|
||||
|
||||
#undef PCRE2_TYPES_LIST
|
||||
#undef PCRE2_STRUCTURE_LIST
|
||||
#undef PCRE2_GENERAL_INFO_FUNCTIONS
|
||||
#undef PCRE2_GENERAL_CONTEXT_FUNCTIONS
|
||||
#undef PCRE2_COMPILE_CONTEXT_FUNCTIONS
|
||||
#undef PCRE2_CONVERT_CONTEXT_FUNCTIONS
|
||||
#undef PCRE2_MATCH_CONTEXT_FUNCTIONS
|
||||
#undef PCRE2_COMPILE_FUNCTIONS
|
||||
#undef PCRE2_PATTERN_INFO_FUNCTIONS
|
||||
#undef PCRE2_MATCH_FUNCTIONS
|
||||
#undef PCRE2_SUBSTRING_FUNCTIONS
|
||||
#undef PCRE2_SERIALIZE_FUNCTIONS
|
||||
#undef PCRE2_SUBSTITUTE_FUNCTION
|
||||
#undef PCRE2_JIT_FUNCTIONS
|
||||
#undef PCRE2_OTHER_FUNCTIONS
|
||||
#undef PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS
|
||||
|
||||
/* PCRE2_CODE_UNIT_WIDTH must be defined. If it is 8, 16, or 32, redefine
|
||||
PCRE2_SUFFIX to use it. If it is 0, undefine the other macros and make
|
||||
PCRE2_SUFFIX a no-op. Otherwise, generate an error. */
|
||||
|
||||
#undef PCRE2_SUFFIX
|
||||
#ifndef PCRE2_CODE_UNIT_WIDTH
|
||||
#error PCRE2_CODE_UNIT_WIDTH must be defined before including pcre2.h.
|
||||
#error Use 8, 16, or 32; or 0 for a multi-width application.
|
||||
#else /* PCRE2_CODE_UNIT_WIDTH is defined */
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8 || \
|
||||
PCRE2_CODE_UNIT_WIDTH == 16 || \
|
||||
PCRE2_CODE_UNIT_WIDTH == 32
|
||||
#define PCRE2_SUFFIX(a) PCRE2_GLUE(a, PCRE2_CODE_UNIT_WIDTH)
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 0
|
||||
#undef PCRE2_JOIN
|
||||
#undef PCRE2_GLUE
|
||||
#define PCRE2_SUFFIX(a) a
|
||||
#else
|
||||
#error PCRE2_CODE_UNIT_WIDTH must be 0, 8, 16, or 32.
|
||||
#endif
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH is defined */
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* PCRE2_H_IDEMPOTENT_GUARD */
|
||||
|
||||
/* End of pcre2.h */
|
1365
third_party/pcre/pcre2_auto_possess.c
vendored
Normal file
1365
third_party/pcre/pcre2_auto_possess.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
202
third_party/pcre/pcre2_chartables.c
vendored
Normal file
202
third_party/pcre/pcre2_chartables.c
vendored
Normal file
|
@ -0,0 +1,202 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* This file was automatically written by the pcre2_dftables auxiliary
|
||||
program. It contains character tables that are used when no external
|
||||
tables are passed to PCRE2 by the application that calls it. The tables
|
||||
are used only for characters whose code values are less than 256. */
|
||||
|
||||
/* This set of tables was written in the C locale. */
|
||||
|
||||
/* The pcre2_ftables program (which is distributed with PCRE2) can be used
|
||||
to build alternative versions of this file. This is necessary if you are
|
||||
running in an EBCDIC environment, or if you want to default to a different
|
||||
encoding, for example ISO-8859-1. When pcre2_dftables is run, it creates
|
||||
these tables in the "C" locale by default. This happens automatically if
|
||||
PCRE2 is configured with --enable-rebuild-chartables. However, you can run
|
||||
pcre2_dftables manually with the -L option to build tables using the LC_ALL
|
||||
locale. */
|
||||
|
||||
/* The following #include is present because without it gcc 4.x may remove
|
||||
the array definition from the final binary if PCRE2 is built into a static
|
||||
library and dead code stripping is activated. This leads to link errors.
|
||||
Pulling in the header ensures that the array gets flagged as "someone
|
||||
outside this compilation unit might reference this" and so it will always
|
||||
be supplied to the linker. */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
const uint8_t PRIV(default_tables)[] = {
|
||||
|
||||
/* This table is a lower casing table. */
|
||||
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39,
|
||||
40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63,
|
||||
64, 97, 98, 99,100,101,102,103,
|
||||
104,105,106,107,108,109,110,111,
|
||||
112,113,114,115,116,117,118,119,
|
||||
120,121,122, 91, 92, 93, 94, 95,
|
||||
96, 97, 98, 99,100,101,102,103,
|
||||
104,105,106,107,108,109,110,111,
|
||||
112,113,114,115,116,117,118,119,
|
||||
120,121,122,123,124,125,126,127,
|
||||
128,129,130,131,132,133,134,135,
|
||||
136,137,138,139,140,141,142,143,
|
||||
144,145,146,147,148,149,150,151,
|
||||
152,153,154,155,156,157,158,159,
|
||||
160,161,162,163,164,165,166,167,
|
||||
168,169,170,171,172,173,174,175,
|
||||
176,177,178,179,180,181,182,183,
|
||||
184,185,186,187,188,189,190,191,
|
||||
192,193,194,195,196,197,198,199,
|
||||
200,201,202,203,204,205,206,207,
|
||||
208,209,210,211,212,213,214,215,
|
||||
216,217,218,219,220,221,222,223,
|
||||
224,225,226,227,228,229,230,231,
|
||||
232,233,234,235,236,237,238,239,
|
||||
240,241,242,243,244,245,246,247,
|
||||
248,249,250,251,252,253,254,255,
|
||||
|
||||
/* This table is a case flipping table. */
|
||||
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39,
|
||||
40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63,
|
||||
64, 97, 98, 99,100,101,102,103,
|
||||
104,105,106,107,108,109,110,111,
|
||||
112,113,114,115,116,117,118,119,
|
||||
120,121,122, 91, 92, 93, 94, 95,
|
||||
96, 65, 66, 67, 68, 69, 70, 71,
|
||||
72, 73, 74, 75, 76, 77, 78, 79,
|
||||
80, 81, 82, 83, 84, 85, 86, 87,
|
||||
88, 89, 90,123,124,125,126,127,
|
||||
128,129,130,131,132,133,134,135,
|
||||
136,137,138,139,140,141,142,143,
|
||||
144,145,146,147,148,149,150,151,
|
||||
152,153,154,155,156,157,158,159,
|
||||
160,161,162,163,164,165,166,167,
|
||||
168,169,170,171,172,173,174,175,
|
||||
176,177,178,179,180,181,182,183,
|
||||
184,185,186,187,188,189,190,191,
|
||||
192,193,194,195,196,197,198,199,
|
||||
200,201,202,203,204,205,206,207,
|
||||
208,209,210,211,212,213,214,215,
|
||||
216,217,218,219,220,221,222,223,
|
||||
224,225,226,227,228,229,230,231,
|
||||
232,233,234,235,236,237,238,239,
|
||||
240,241,242,243,244,245,246,247,
|
||||
248,249,250,251,252,253,254,255,
|
||||
|
||||
/* This table contains bit maps for various character classes. Each map is 32
|
||||
bytes long and the bits run from the least significant end of each byte. The
|
||||
classes that have their own maps are: space, xdigit, digit, upper, lower, word,
|
||||
graph, print, punct, and cntrl. Other classes are built from combinations. */
|
||||
|
||||
0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, /* space */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* xdigit */
|
||||
0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* digit */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* upper */
|
||||
0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* lower */
|
||||
0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* word */
|
||||
0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, /* graph */
|
||||
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, /* print */
|
||||
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, /* punct */
|
||||
0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, /* cntrl */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
/* This table identifies various classes of character by individual bits:
|
||||
0x01 white space character
|
||||
0x02 letter
|
||||
0x04 lower case letter
|
||||
0x08 decimal digit
|
||||
0x10 alphanumeric or '_'
|
||||
*/
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
|
||||
0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
|
||||
0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
|
||||
0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, /* 0 - 7 */
|
||||
0x18,0x18,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
|
||||
0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* @ - G */
|
||||
0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
|
||||
0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
|
||||
0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x10, /* X - _ */
|
||||
0x00,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* ` - g */
|
||||
0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* h - o */
|
||||
0x16,0x16,0x16,0x16,0x16,0x16,0x16,0x16, /* p - w */
|
||||
0x16,0x16,0x16,0x00,0x00,0x00,0x00,0x00, /* x -127 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
|
||||
|
||||
/* End of pcre2_chartables.c */
|
10635
third_party/pcre/pcre2_compile.c
vendored
Normal file
10635
third_party/pcre/pcre2_compile.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
252
third_party/pcre/pcre2_config.c
vendored
Normal file
252
third_party/pcre/pcre2_config.c
vendored
Normal file
|
@ -0,0 +1,252 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2020 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
/* Save the configured link size, which is in bytes. In 16-bit and 32-bit modes
|
||||
its value gets changed by pcre2_intmodedep.h (included by pcre2_internal.h) to
|
||||
be in code units. */
|
||||
|
||||
static int configured_link_size = LINK_SIZE;
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
/* These macros are the standard way of turning unquoted text into C strings.
|
||||
They allow macros like PCRE2_MAJOR to be defined without quotes, which is
|
||||
convenient for user programs that want to test their values. */
|
||||
|
||||
#define STRING(a) # a
|
||||
#define XSTRING(s) STRING(s)
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Return info about what features are configured *
|
||||
*************************************************/
|
||||
|
||||
/* If where is NULL, the length of memory required is returned.
|
||||
|
||||
Arguments:
|
||||
what what information is required
|
||||
where where to put the information
|
||||
|
||||
Returns: 0 if a numerical value is returned
|
||||
>= 0 if a string value
|
||||
PCRE2_ERROR_BADOPTION if "where" not recognized
|
||||
or JIT target requested when JIT not enabled
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_config(uint32_t what, void *where)
|
||||
{
|
||||
if (where == NULL) /* Requests a length */
|
||||
{
|
||||
switch(what)
|
||||
{
|
||||
default:
|
||||
return PCRE2_ERROR_BADOPTION;
|
||||
|
||||
case PCRE2_CONFIG_BSR:
|
||||
case PCRE2_CONFIG_COMPILED_WIDTHS:
|
||||
case PCRE2_CONFIG_DEPTHLIMIT:
|
||||
case PCRE2_CONFIG_HEAPLIMIT:
|
||||
case PCRE2_CONFIG_JIT:
|
||||
case PCRE2_CONFIG_LINKSIZE:
|
||||
case PCRE2_CONFIG_MATCHLIMIT:
|
||||
case PCRE2_CONFIG_NEVER_BACKSLASH_C:
|
||||
case PCRE2_CONFIG_NEWLINE:
|
||||
case PCRE2_CONFIG_PARENSLIMIT:
|
||||
case PCRE2_CONFIG_STACKRECURSE: /* Obsolete */
|
||||
case PCRE2_CONFIG_TABLES_LENGTH:
|
||||
case PCRE2_CONFIG_UNICODE:
|
||||
return sizeof(uint32_t);
|
||||
|
||||
/* These are handled below */
|
||||
|
||||
case PCRE2_CONFIG_JITTARGET:
|
||||
case PCRE2_CONFIG_UNICODE_VERSION:
|
||||
case PCRE2_CONFIG_VERSION:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (what)
|
||||
{
|
||||
default:
|
||||
return PCRE2_ERROR_BADOPTION;
|
||||
|
||||
case PCRE2_CONFIG_BSR:
|
||||
#ifdef BSR_ANYCRLF
|
||||
*((uint32_t *)where) = PCRE2_BSR_ANYCRLF;
|
||||
#else
|
||||
*((uint32_t *)where) = PCRE2_BSR_UNICODE;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_COMPILED_WIDTHS:
|
||||
*((uint32_t *)where) = 0
|
||||
#ifdef SUPPORT_PCRE2_8
|
||||
+ 1
|
||||
#endif
|
||||
#ifdef SUPPORT_PCRE2_16
|
||||
+ 2
|
||||
#endif
|
||||
#ifdef SUPPORT_PCRE2_32
|
||||
+ 4
|
||||
#endif
|
||||
;
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_DEPTHLIMIT:
|
||||
*((uint32_t *)where) = MATCH_LIMIT_DEPTH;
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_HEAPLIMIT:
|
||||
*((uint32_t *)where) = HEAP_LIMIT;
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_JIT:
|
||||
#ifdef SUPPORT_JIT
|
||||
*((uint32_t *)where) = 1;
|
||||
#else
|
||||
*((uint32_t *)where) = 0;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_JITTARGET:
|
||||
#ifdef SUPPORT_JIT
|
||||
{
|
||||
const char *v = PRIV(jit_get_target)();
|
||||
return (int)(1 + ((where == NULL)?
|
||||
strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)));
|
||||
}
|
||||
#else
|
||||
return PCRE2_ERROR_BADOPTION;
|
||||
#endif
|
||||
|
||||
case PCRE2_CONFIG_LINKSIZE:
|
||||
*((uint32_t *)where) = (uint32_t)configured_link_size;
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_MATCHLIMIT:
|
||||
*((uint32_t *)where) = MATCH_LIMIT;
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_NEWLINE:
|
||||
*((uint32_t *)where) = NEWLINE_DEFAULT;
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_NEVER_BACKSLASH_C:
|
||||
#ifdef NEVER_BACKSLASH_C
|
||||
*((uint32_t *)where) = 1;
|
||||
#else
|
||||
*((uint32_t *)where) = 0;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_PARENSLIMIT:
|
||||
*((uint32_t *)where) = PARENS_NEST_LIMIT;
|
||||
break;
|
||||
|
||||
/* This is now obsolete. The stack is no longer used via recursion for
|
||||
handling backtracking in pcre2_match(). */
|
||||
|
||||
case PCRE2_CONFIG_STACKRECURSE:
|
||||
*((uint32_t *)where) = 0;
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_TABLES_LENGTH:
|
||||
*((uint32_t *)where) = TABLES_LENGTH;
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_UNICODE_VERSION:
|
||||
{
|
||||
#if defined SUPPORT_UNICODE
|
||||
const char *v = PRIV(unicode_version);
|
||||
#else
|
||||
const char *v = "Unicode not supported";
|
||||
#endif
|
||||
return (int)(1 + ((where == NULL)?
|
||||
strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)));
|
||||
}
|
||||
break;
|
||||
|
||||
case PCRE2_CONFIG_UNICODE:
|
||||
#if defined SUPPORT_UNICODE
|
||||
*((uint32_t *)where) = 1;
|
||||
#else
|
||||
*((uint32_t *)where) = 0;
|
||||
#endif
|
||||
break;
|
||||
|
||||
/* The hackery in setting "v" below is to cope with the case when
|
||||
PCRE2_PRERELEASE is set to an empty string (which it is for real releases).
|
||||
If the second alternative is used in this case, it does not leave a space
|
||||
before the date. On the other hand, if all four macros are put into a single
|
||||
XSTRING when PCRE2_PRERELEASE is not empty, an unwanted space is inserted.
|
||||
There are problems using an "obvious" approach like this:
|
||||
|
||||
XSTRING(PCRE2_MAJOR) "." XSTRING(PCRE_MINOR)
|
||||
XSTRING(PCRE2_PRERELEASE) " " XSTRING(PCRE_DATE)
|
||||
|
||||
because, when PCRE2_PRERELEASE is empty, this leads to an attempted expansion
|
||||
of STRING(). The C standard states: "If (before argument substitution) any
|
||||
argument consists of no preprocessing tokens, the behavior is undefined." It
|
||||
turns out the gcc treats this case as a single empty string - which is what
|
||||
we really want - but Visual C grumbles about the lack of an argument for the
|
||||
macro. Unfortunately, both are within their rights. As there seems to be no
|
||||
way to test for a macro's value being empty at compile time, we have to
|
||||
resort to a runtime test. */
|
||||
|
||||
case PCRE2_CONFIG_VERSION:
|
||||
{
|
||||
const char *v = (XSTRING(Z PCRE2_PRERELEASE)[1] == 0)?
|
||||
XSTRING(PCRE2_MAJOR.PCRE2_MINOR PCRE2_DATE) :
|
||||
XSTRING(PCRE2_MAJOR.PCRE2_MINOR) XSTRING(PCRE2_PRERELEASE PCRE2_DATE);
|
||||
return (int)(1 + ((where == NULL)?
|
||||
strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)));
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* End of pcre2_config.c */
|
494
third_party/pcre/pcre2_context.c
vendored
Normal file
494
third_party/pcre/pcre2_context.c
vendored
Normal file
|
@ -0,0 +1,494 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Default malloc/free functions *
|
||||
*************************************************/
|
||||
|
||||
/* Ignore the "user data" argument in each case. */
|
||||
|
||||
static void *default_malloc(size_t size, void *data)
|
||||
{
|
||||
(void)data;
|
||||
return malloc(size);
|
||||
}
|
||||
|
||||
|
||||
static void default_free(void *block, void *data)
|
||||
{
|
||||
(void)data;
|
||||
free(block);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get a block and save memory control *
|
||||
*************************************************/
|
||||
|
||||
/* This internal function is called to get a block of memory in which the
|
||||
memory control data is to be stored at the start for future use.
|
||||
|
||||
Arguments:
|
||||
size amount of memory required
|
||||
memctl pointer to a memctl block or NULL
|
||||
|
||||
Returns: pointer to memory or NULL on failure
|
||||
*/
|
||||
|
||||
extern void *
|
||||
PRIV(memctl_malloc)(size_t size, pcre2_memctl *memctl)
|
||||
{
|
||||
pcre2_memctl *newmemctl;
|
||||
void *yield = (memctl == NULL)? malloc(size) :
|
||||
memctl->malloc(size, memctl->memory_data);
|
||||
if (yield == NULL) return NULL;
|
||||
newmemctl = (pcre2_memctl *)yield;
|
||||
if (memctl == NULL)
|
||||
{
|
||||
newmemctl->malloc = default_malloc;
|
||||
newmemctl->free = default_free;
|
||||
newmemctl->memory_data = NULL;
|
||||
}
|
||||
else *newmemctl = *memctl;
|
||||
return yield;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Create and initialize contexts *
|
||||
*************************************************/
|
||||
|
||||
/* Initializing for compile and match contexts is done in separate, private
|
||||
functions so that these can be called from functions such as pcre2_compile()
|
||||
when an external context is not supplied. The initializing functions have an
|
||||
option to set up default memory management. */
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_general_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_general_context_create(void *(*private_malloc)(size_t, void *),
|
||||
void (*private_free)(void *, void *), void *memory_data)
|
||||
{
|
||||
pcre2_general_context *gcontext;
|
||||
if (private_malloc == NULL) private_malloc = default_malloc;
|
||||
if (private_free == NULL) private_free = default_free;
|
||||
gcontext = private_malloc(sizeof(pcre2_real_general_context), memory_data);
|
||||
if (gcontext == NULL) return NULL;
|
||||
gcontext->memctl.malloc = private_malloc;
|
||||
gcontext->memctl.free = private_free;
|
||||
gcontext->memctl.memory_data = memory_data;
|
||||
return gcontext;
|
||||
}
|
||||
|
||||
|
||||
/* A default compile context is set up to save having to initialize at run time
|
||||
when no context is supplied to the compile function. */
|
||||
|
||||
const pcre2_compile_context PRIV(default_compile_context) = {
|
||||
{ default_malloc, default_free, NULL }, /* Default memory handling */
|
||||
NULL, /* Stack guard */
|
||||
NULL, /* Stack guard data */
|
||||
PRIV(default_tables), /* Character tables */
|
||||
PCRE2_UNSET, /* Max pattern length */
|
||||
BSR_DEFAULT, /* Backslash R default */
|
||||
NEWLINE_DEFAULT, /* Newline convention */
|
||||
PARENS_NEST_LIMIT, /* As it says */
|
||||
0 }; /* Extra options */
|
||||
|
||||
/* The create function copies the default into the new memory, but must
|
||||
override the default memory handling functions if a gcontext was provided. */
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_compile_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_compile_context_create(pcre2_general_context *gcontext)
|
||||
{
|
||||
pcre2_compile_context *ccontext = PRIV(memctl_malloc)(
|
||||
sizeof(pcre2_real_compile_context), (pcre2_memctl *)gcontext);
|
||||
if (ccontext == NULL) return NULL;
|
||||
*ccontext = PRIV(default_compile_context);
|
||||
if (gcontext != NULL)
|
||||
*((pcre2_memctl *)ccontext) = *((pcre2_memctl *)gcontext);
|
||||
return ccontext;
|
||||
}
|
||||
|
||||
|
||||
/* A default match context is set up to save having to initialize at run time
|
||||
when no context is supplied to a match function. */
|
||||
|
||||
const pcre2_match_context PRIV(default_match_context) = {
|
||||
{ default_malloc, default_free, NULL },
|
||||
#ifdef SUPPORT_JIT
|
||||
NULL, /* JIT callback */
|
||||
NULL, /* JIT callback data */
|
||||
#endif
|
||||
NULL, /* Callout function */
|
||||
NULL, /* Callout data */
|
||||
NULL, /* Substitute callout function */
|
||||
NULL, /* Substitute callout data */
|
||||
PCRE2_UNSET, /* Offset limit */
|
||||
HEAP_LIMIT,
|
||||
MATCH_LIMIT,
|
||||
MATCH_LIMIT_DEPTH };
|
||||
|
||||
/* The create function copies the default into the new memory, but must
|
||||
override the default memory handling functions if a gcontext was provided. */
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_match_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_match_context_create(pcre2_general_context *gcontext)
|
||||
{
|
||||
pcre2_match_context *mcontext = PRIV(memctl_malloc)(
|
||||
sizeof(pcre2_real_match_context), (pcre2_memctl *)gcontext);
|
||||
if (mcontext == NULL) return NULL;
|
||||
*mcontext = PRIV(default_match_context);
|
||||
if (gcontext != NULL)
|
||||
*((pcre2_memctl *)mcontext) = *((pcre2_memctl *)gcontext);
|
||||
return mcontext;
|
||||
}
|
||||
|
||||
|
||||
/* A default convert context is set up to save having to initialize at run time
|
||||
when no context is supplied to the convert function. */
|
||||
|
||||
const pcre2_convert_context PRIV(default_convert_context) = {
|
||||
{ default_malloc, default_free, NULL }, /* Default memory handling */
|
||||
#ifdef _WIN32
|
||||
CHAR_BACKSLASH, /* Default path separator */
|
||||
CHAR_GRAVE_ACCENT /* Default escape character */
|
||||
#else /* Not Windows */
|
||||
CHAR_SLASH, /* Default path separator */
|
||||
CHAR_BACKSLASH /* Default escape character */
|
||||
#endif
|
||||
};
|
||||
|
||||
/* The create function copies the default into the new memory, but must
|
||||
override the default memory handling functions if a gcontext was provided. */
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_convert_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_convert_context_create(pcre2_general_context *gcontext)
|
||||
{
|
||||
pcre2_convert_context *ccontext = PRIV(memctl_malloc)(
|
||||
sizeof(pcre2_real_convert_context), (pcre2_memctl *)gcontext);
|
||||
if (ccontext == NULL) return NULL;
|
||||
*ccontext = PRIV(default_convert_context);
|
||||
if (gcontext != NULL)
|
||||
*((pcre2_memctl *)ccontext) = *((pcre2_memctl *)gcontext);
|
||||
return ccontext;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Context copy functions *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_general_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_general_context_copy(pcre2_general_context *gcontext)
|
||||
{
|
||||
pcre2_general_context *new =
|
||||
gcontext->memctl.malloc(sizeof(pcre2_real_general_context),
|
||||
gcontext->memctl.memory_data);
|
||||
if (new == NULL) return NULL;
|
||||
memcpy(new, gcontext, sizeof(pcre2_real_general_context));
|
||||
return new;
|
||||
}
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_compile_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_compile_context_copy(pcre2_compile_context *ccontext)
|
||||
{
|
||||
pcre2_compile_context *new =
|
||||
ccontext->memctl.malloc(sizeof(pcre2_real_compile_context),
|
||||
ccontext->memctl.memory_data);
|
||||
if (new == NULL) return NULL;
|
||||
memcpy(new, ccontext, sizeof(pcre2_real_compile_context));
|
||||
return new;
|
||||
}
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_match_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_match_context_copy(pcre2_match_context *mcontext)
|
||||
{
|
||||
pcre2_match_context *new =
|
||||
mcontext->memctl.malloc(sizeof(pcre2_real_match_context),
|
||||
mcontext->memctl.memory_data);
|
||||
if (new == NULL) return NULL;
|
||||
memcpy(new, mcontext, sizeof(pcre2_real_match_context));
|
||||
return new;
|
||||
}
|
||||
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_convert_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_convert_context_copy(pcre2_convert_context *ccontext)
|
||||
{
|
||||
pcre2_convert_context *new =
|
||||
ccontext->memctl.malloc(sizeof(pcre2_real_convert_context),
|
||||
ccontext->memctl.memory_data);
|
||||
if (new == NULL) return NULL;
|
||||
memcpy(new, ccontext, sizeof(pcre2_real_convert_context));
|
||||
return new;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Context free functions *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_general_context_free(pcre2_general_context *gcontext)
|
||||
{
|
||||
if (gcontext != NULL)
|
||||
gcontext->memctl.free(gcontext, gcontext->memctl.memory_data);
|
||||
}
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_compile_context_free(pcre2_compile_context *ccontext)
|
||||
{
|
||||
if (ccontext != NULL)
|
||||
ccontext->memctl.free(ccontext, ccontext->memctl.memory_data);
|
||||
}
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_match_context_free(pcre2_match_context *mcontext)
|
||||
{
|
||||
if (mcontext != NULL)
|
||||
mcontext->memctl.free(mcontext, mcontext->memctl.memory_data);
|
||||
}
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_convert_context_free(pcre2_convert_context *ccontext)
|
||||
{
|
||||
if (ccontext != NULL)
|
||||
ccontext->memctl.free(ccontext, ccontext->memctl.memory_data);
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Set values in contexts *
|
||||
*************************************************/
|
||||
|
||||
/* All these functions return 0 for success or PCRE2_ERROR_BADDATA if invalid
|
||||
data is given. Only some of the functions are able to test the validity of the
|
||||
data. */
|
||||
|
||||
|
||||
/* ------------ Compile context ------------ */
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_character_tables(pcre2_compile_context *ccontext,
|
||||
const uint8_t *tables)
|
||||
{
|
||||
ccontext->tables = tables;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_bsr(pcre2_compile_context *ccontext, uint32_t value)
|
||||
{
|
||||
switch(value)
|
||||
{
|
||||
case PCRE2_BSR_ANYCRLF:
|
||||
case PCRE2_BSR_UNICODE:
|
||||
ccontext->bsr_convention = value;
|
||||
return 0;
|
||||
|
||||
default:
|
||||
return PCRE2_ERROR_BADDATA;
|
||||
}
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, PCRE2_SIZE length)
|
||||
{
|
||||
ccontext->max_pattern_length = length;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t newline)
|
||||
{
|
||||
switch(newline)
|
||||
{
|
||||
case PCRE2_NEWLINE_CR:
|
||||
case PCRE2_NEWLINE_LF:
|
||||
case PCRE2_NEWLINE_CRLF:
|
||||
case PCRE2_NEWLINE_ANY:
|
||||
case PCRE2_NEWLINE_ANYCRLF:
|
||||
case PCRE2_NEWLINE_NUL:
|
||||
ccontext->newline_convention = newline;
|
||||
return 0;
|
||||
|
||||
default:
|
||||
return PCRE2_ERROR_BADDATA;
|
||||
}
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t limit)
|
||||
{
|
||||
ccontext->parens_nest_limit = limit;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, uint32_t options)
|
||||
{
|
||||
ccontext->extra_options = options;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
|
||||
int (*guard)(uint32_t, void *), void *user_data)
|
||||
{
|
||||
ccontext->stack_guard = guard;
|
||||
ccontext->stack_guard_data = user_data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* ------------ Match context ------------ */
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_callout(pcre2_match_context *mcontext,
|
||||
int (*callout)(pcre2_callout_block *, void *), void *callout_data)
|
||||
{
|
||||
mcontext->callout = callout;
|
||||
mcontext->callout_data = callout_data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_substitute_callout(pcre2_match_context *mcontext,
|
||||
int (*substitute_callout)(pcre2_substitute_callout_block *, void *),
|
||||
void *substitute_callout_data)
|
||||
{
|
||||
mcontext->substitute_callout = substitute_callout;
|
||||
mcontext->substitute_callout_data = substitute_callout_data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t limit)
|
||||
{
|
||||
mcontext->heap_limit = limit;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t limit)
|
||||
{
|
||||
mcontext->match_limit = limit;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_depth_limit(pcre2_match_context *mcontext, uint32_t limit)
|
||||
{
|
||||
mcontext->depth_limit = limit;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE limit)
|
||||
{
|
||||
mcontext->offset_limit = limit;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* These functions became obsolete at release 10.30. The first is kept as a
|
||||
synonym for backwards compatibility. The second now does nothing. Exclude both
|
||||
from coverage reports. */
|
||||
|
||||
/* LCOV_EXCL_START */
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_recursion_limit(pcre2_match_context *mcontext, uint32_t limit)
|
||||
{
|
||||
return pcre2_set_depth_limit(mcontext, limit);
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_recursion_memory_management(pcre2_match_context *mcontext,
|
||||
void *(*mymalloc)(size_t, void *), void (*myfree)(void *, void *),
|
||||
void *mydata)
|
||||
{
|
||||
(void)mcontext;
|
||||
(void)mymalloc;
|
||||
(void)myfree;
|
||||
(void)mydata;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* LCOV_EXCL_STOP */
|
||||
|
||||
|
||||
/* ------------ Convert context ------------ */
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_glob_separator(pcre2_convert_context *ccontext, uint32_t separator)
|
||||
{
|
||||
if (separator != CHAR_SLASH && separator != CHAR_BACKSLASH &&
|
||||
separator != CHAR_DOT) return PCRE2_ERROR_BADDATA;
|
||||
ccontext->glob_separator = separator;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_glob_escape(pcre2_convert_context *ccontext, uint32_t escape)
|
||||
{
|
||||
if (escape > 255 || (escape != 0 && !ispunct(escape)))
|
||||
return PCRE2_ERROR_BADDATA;
|
||||
ccontext->glob_escape = escape;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* End of pcre2_context.c */
|
||||
|
1181
third_party/pcre/pcre2_convert.c
vendored
Normal file
1181
third_party/pcre/pcre2_convert.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
4066
third_party/pcre/pcre2_dfa_match.c
vendored
Normal file
4066
third_party/pcre/pcre2_dfa_match.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
341
third_party/pcre/pcre2_error.c
vendored
Normal file
341
third_party/pcre/pcre2_error.c
vendored
Normal file
|
@ -0,0 +1,341 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
#define STRING(a) # a
|
||||
#define XSTRING(s) STRING(s)
|
||||
|
||||
/* The texts of compile-time error messages. Compile-time error numbers start
|
||||
at COMPILE_ERROR_BASE (100).
|
||||
|
||||
This used to be a table of strings, but in order to reduce the number of
|
||||
relocations needed when a shared library is loaded dynamically, it is now one
|
||||
long string. We cannot use a table of offsets, because the lengths of inserts
|
||||
such as XSTRING(MAX_NAME_SIZE) are not known. Instead,
|
||||
pcre2_get_error_message() counts through to the one it wants - this isn't a
|
||||
performance issue because these strings are used only when there is an error.
|
||||
|
||||
Each substring ends with \0 to insert a null character. This includes the final
|
||||
substring, so that the whole string ends with \0\0, which can be detected when
|
||||
counting through. */
|
||||
|
||||
static const unsigned char compile_error_texts[] =
|
||||
"no error\0"
|
||||
"\\ at end of pattern\0"
|
||||
"\\c at end of pattern\0"
|
||||
"unrecognized character follows \\\0"
|
||||
"numbers out of order in {} quantifier\0"
|
||||
/* 5 */
|
||||
"number too big in {} quantifier\0"
|
||||
"missing terminating ] for character class\0"
|
||||
"escape sequence is invalid in character class\0"
|
||||
"range out of order in character class\0"
|
||||
"quantifier does not follow a repeatable item\0"
|
||||
/* 10 */
|
||||
"internal error: unexpected repeat\0"
|
||||
"unrecognized character after (? or (?-\0"
|
||||
"POSIX named classes are supported only within a class\0"
|
||||
"POSIX collating elements are not supported\0"
|
||||
"missing closing parenthesis\0"
|
||||
/* 15 */
|
||||
"reference to non-existent subpattern\0"
|
||||
"pattern passed as NULL\0"
|
||||
"unrecognised compile-time option bit(s)\0"
|
||||
"missing ) after (?# comment\0"
|
||||
"parentheses are too deeply nested\0"
|
||||
/* 20 */
|
||||
"regular expression is too large\0"
|
||||
"failed to allocate heap memory\0"
|
||||
"unmatched closing parenthesis\0"
|
||||
"internal error: code overflow\0"
|
||||
"missing closing parenthesis for condition\0"
|
||||
/* 25 */
|
||||
"lookbehind assertion is not fixed length\0"
|
||||
"a relative value of zero is not allowed\0"
|
||||
"conditional subpattern contains more than two branches\0"
|
||||
"assertion expected after (?( or (?(?C)\0"
|
||||
"digit expected after (?+ or (?-\0"
|
||||
/* 30 */
|
||||
"unknown POSIX class name\0"
|
||||
"internal error in pcre2_study(): should not occur\0"
|
||||
"this version of PCRE2 does not have Unicode support\0"
|
||||
"parentheses are too deeply nested (stack check)\0"
|
||||
"character code point value in \\x{} or \\o{} is too large\0"
|
||||
/* 35 */
|
||||
"lookbehind is too complicated\0"
|
||||
"\\C is not allowed in a lookbehind assertion in UTF-" XSTRING(PCRE2_CODE_UNIT_WIDTH) " mode\0"
|
||||
"PCRE2 does not support \\F, \\L, \\l, \\N{name}, \\U, or \\u\0"
|
||||
"number after (?C is greater than 255\0"
|
||||
"closing parenthesis for (?C expected\0"
|
||||
/* 40 */
|
||||
"invalid escape sequence in (*VERB) name\0"
|
||||
"unrecognized character after (?P\0"
|
||||
"syntax error in subpattern name (missing terminator?)\0"
|
||||
"two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0"
|
||||
"subpattern name must start with a non-digit\0"
|
||||
/* 45 */
|
||||
"this version of PCRE2 does not have support for \\P, \\p, or \\X\0"
|
||||
"malformed \\P or \\p sequence\0"
|
||||
"unknown property after \\P or \\p\0"
|
||||
"subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " code units)\0"
|
||||
"too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
|
||||
/* 50 */
|
||||
"invalid range in character class\0"
|
||||
"octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
|
||||
"internal error: overran compiling workspace\0"
|
||||
"internal error: previously-checked referenced subpattern not found\0"
|
||||
"DEFINE subpattern contains more than one branch\0"
|
||||
/* 55 */
|
||||
"missing opening brace after \\o\0"
|
||||
"internal error: unknown newline setting\0"
|
||||
"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
|
||||
"(?R (recursive pattern call) must be followed by a closing parenthesis\0"
|
||||
/* "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0" */
|
||||
"obsolete error (should not occur)\0" /* Was the above */
|
||||
/* 60 */
|
||||
"(*VERB) not recognized or malformed\0"
|
||||
"subpattern number is too big\0"
|
||||
"subpattern name expected\0"
|
||||
"internal error: parsed pattern overflow\0"
|
||||
"non-octal character in \\o{} (closing brace missing?)\0"
|
||||
/* 65 */
|
||||
"different names for subpatterns of the same number are not allowed\0"
|
||||
"(*MARK) must have an argument\0"
|
||||
"non-hex character in \\x{} (closing brace missing?)\0"
|
||||
#ifndef EBCDIC
|
||||
"\\c must be followed by a printable ASCII character\0"
|
||||
#else
|
||||
"\\c must be followed by a letter or one of [\\]^_?\0"
|
||||
#endif
|
||||
"\\k is not followed by a braced, angle-bracketed, or quoted name\0"
|
||||
/* 70 */
|
||||
"internal error: unknown meta code in check_lookbehinds()\0"
|
||||
"\\N is not supported in a class\0"
|
||||
"callout string is too long\0"
|
||||
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
|
||||
"using UTF is disabled by the application\0"
|
||||
/* 75 */
|
||||
"using UCP is disabled by the application\0"
|
||||
"name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
|
||||
"character code point value in \\u.... sequence is too large\0"
|
||||
"digits missing in \\x{} or \\o{} or \\N{U+}\0"
|
||||
"syntax error or number too big in (?(VERSION condition\0"
|
||||
/* 80 */
|
||||
"internal error: unknown opcode in auto_possessify()\0"
|
||||
"missing terminating delimiter for callout with string argument\0"
|
||||
"unrecognized string delimiter follows (?C\0"
|
||||
"using \\C is disabled by the application\0"
|
||||
"(?| and/or (?J: or (?x: parentheses are too deeply nested\0"
|
||||
/* 85 */
|
||||
"using \\C is disabled in this PCRE2 library\0"
|
||||
"regular expression is too complicated\0"
|
||||
"lookbehind assertion is too long\0"
|
||||
"pattern string is longer than the limit set by the application\0"
|
||||
"internal error: unknown code in parsed pattern\0"
|
||||
/* 90 */
|
||||
"internal error: bad code value in parsed_skip()\0"
|
||||
"PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode\0"
|
||||
"invalid option bits with PCRE2_LITERAL\0"
|
||||
"\\N{U+dddd} is supported only in Unicode (UTF) mode\0"
|
||||
"invalid hyphen in option setting\0"
|
||||
/* 95 */
|
||||
"(*alpha_assertion) not recognized\0"
|
||||
"script runs require Unicode support, which this version of PCRE2 does not have\0"
|
||||
"too many capturing groups (maximum 65535)\0"
|
||||
"atomic assertion expected after (?( or (?(?C)\0"
|
||||
"\\K is not allowed in lookarounds (but see PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)\0"
|
||||
;
|
||||
|
||||
/* Match-time and UTF error texts are in the same format. */
|
||||
|
||||
static const unsigned char match_error_texts[] =
|
||||
"no error\0"
|
||||
"no match\0"
|
||||
"partial match\0"
|
||||
"UTF-8 error: 1 byte missing at end\0"
|
||||
"UTF-8 error: 2 bytes missing at end\0"
|
||||
/* 5 */
|
||||
"UTF-8 error: 3 bytes missing at end\0"
|
||||
"UTF-8 error: 4 bytes missing at end\0"
|
||||
"UTF-8 error: 5 bytes missing at end\0"
|
||||
"UTF-8 error: byte 2 top bits not 0x80\0"
|
||||
"UTF-8 error: byte 3 top bits not 0x80\0"
|
||||
/* 10 */
|
||||
"UTF-8 error: byte 4 top bits not 0x80\0"
|
||||
"UTF-8 error: byte 5 top bits not 0x80\0"
|
||||
"UTF-8 error: byte 6 top bits not 0x80\0"
|
||||
"UTF-8 error: 5-byte character is not allowed (RFC 3629)\0"
|
||||
"UTF-8 error: 6-byte character is not allowed (RFC 3629)\0"
|
||||
/* 15 */
|
||||
"UTF-8 error: code points greater than 0x10ffff are not defined\0"
|
||||
"UTF-8 error: code points 0xd800-0xdfff are not defined\0"
|
||||
"UTF-8 error: overlong 2-byte sequence\0"
|
||||
"UTF-8 error: overlong 3-byte sequence\0"
|
||||
"UTF-8 error: overlong 4-byte sequence\0"
|
||||
/* 20 */
|
||||
"UTF-8 error: overlong 5-byte sequence\0"
|
||||
"UTF-8 error: overlong 6-byte sequence\0"
|
||||
"UTF-8 error: isolated byte with 0x80 bit set\0"
|
||||
"UTF-8 error: illegal byte (0xfe or 0xff)\0"
|
||||
"UTF-16 error: missing low surrogate at end\0"
|
||||
/* 25 */
|
||||
"UTF-16 error: invalid low surrogate\0"
|
||||
"UTF-16 error: isolated low surrogate\0"
|
||||
"UTF-32 error: code points 0xd800-0xdfff are not defined\0"
|
||||
"UTF-32 error: code points greater than 0x10ffff are not defined\0"
|
||||
"bad data value\0"
|
||||
/* 30 */
|
||||
"patterns do not all use the same character tables\0"
|
||||
"magic number missing\0"
|
||||
"pattern compiled in wrong mode: 8/16/32-bit error\0"
|
||||
"bad offset value\0"
|
||||
"bad option value\0"
|
||||
/* 35 */
|
||||
"invalid replacement string\0"
|
||||
"bad offset into UTF string\0"
|
||||
"callout error code\0" /* Never returned by PCRE2 itself */
|
||||
"invalid data in workspace for DFA restart\0"
|
||||
"too much recursion for DFA matching\0"
|
||||
/* 40 */
|
||||
"backreference condition or recursion test is not supported for DFA matching\0"
|
||||
"function is not supported for DFA matching\0"
|
||||
"pattern contains an item that is not supported for DFA matching\0"
|
||||
"workspace size exceeded in DFA matching\0"
|
||||
"internal error - pattern overwritten?\0"
|
||||
/* 45 */
|
||||
"bad JIT option\0"
|
||||
"JIT stack limit reached\0"
|
||||
"match limit exceeded\0"
|
||||
"no more memory\0"
|
||||
"unknown substring\0"
|
||||
/* 50 */
|
||||
"non-unique substring name\0"
|
||||
"NULL argument passed with non-zero length\0"
|
||||
"nested recursion at the same subject position\0"
|
||||
"matching depth limit exceeded\0"
|
||||
"requested value is not available\0"
|
||||
/* 55 */
|
||||
"requested value is not set\0"
|
||||
"offset limit set without PCRE2_USE_OFFSET_LIMIT\0"
|
||||
"bad escape sequence in replacement string\0"
|
||||
"expected closing curly bracket in replacement string\0"
|
||||
"bad substitution in replacement string\0"
|
||||
/* 60 */
|
||||
"match with end before start or start moved backwards is not supported\0"
|
||||
"too many replacements (more than INT_MAX)\0"
|
||||
"bad serialized data\0"
|
||||
"heap limit exceeded\0"
|
||||
"invalid syntax\0"
|
||||
/* 65 */
|
||||
"internal error - duplicate substitution match\0"
|
||||
"PCRE2_MATCH_INVALID_UTF is not supported for DFA matching\0"
|
||||
;
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Return error message *
|
||||
*************************************************/
|
||||
|
||||
/* This function copies an error message into a buffer whose units are of an
|
||||
appropriate width. Error numbers are positive for compile-time errors, and
|
||||
negative for match-time errors (except for UTF errors), but the numbers are all
|
||||
distinct.
|
||||
|
||||
Arguments:
|
||||
enumber error number
|
||||
buffer where to put the message (zero terminated)
|
||||
size size of the buffer in code units
|
||||
|
||||
Returns: length of message if all is well
|
||||
negative on error
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_get_error_message(int enumber, PCRE2_UCHAR *buffer, PCRE2_SIZE size)
|
||||
{
|
||||
const unsigned char *message;
|
||||
PCRE2_SIZE i;
|
||||
int n;
|
||||
|
||||
if (size == 0) return PCRE2_ERROR_NOMEMORY;
|
||||
|
||||
if (enumber >= COMPILE_ERROR_BASE) /* Compile error */
|
||||
{
|
||||
message = compile_error_texts;
|
||||
n = enumber - COMPILE_ERROR_BASE;
|
||||
}
|
||||
else if (enumber < 0) /* Match or UTF error */
|
||||
{
|
||||
message = match_error_texts;
|
||||
n = -enumber;
|
||||
}
|
||||
else /* Invalid error number */
|
||||
{
|
||||
message = (unsigned char *)"\0"; /* Empty message list */
|
||||
n = 1;
|
||||
}
|
||||
|
||||
for (; n > 0; n--)
|
||||
{
|
||||
while (*message++ != CHAR_NUL) {};
|
||||
if (*message == CHAR_NUL) return PCRE2_ERROR_BADDATA;
|
||||
}
|
||||
|
||||
for (i = 0; *message != 0; i++)
|
||||
{
|
||||
if (i >= size - 1)
|
||||
{
|
||||
buffer[i] = 0; /* Terminate partial message */
|
||||
return PCRE2_ERROR_NOMEMORY;
|
||||
}
|
||||
buffer[i] = *message++;
|
||||
}
|
||||
|
||||
buffer[i] = 0;
|
||||
return (int)i;
|
||||
}
|
||||
|
||||
/* End of pcre2_error.c */
|
148
third_party/pcre/pcre2_extuni.c
vendored
Normal file
148
third_party/pcre/pcre2_extuni.c
vendored
Normal file
|
@ -0,0 +1,148 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains an internal function that is used to match a Unicode
|
||||
extended grapheme sequence. It is used by both pcre2_match() and
|
||||
pcre2_def_match(). However, it is called only when Unicode support is being
|
||||
compiled. Nevertheless, we provide a dummy function when there is no Unicode
|
||||
support, because some compilers do not like functionless source files. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
/* Dummy function */
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
PCRE2_SPTR
|
||||
PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
|
||||
PCRE2_SPTR end_subject, BOOL utf, int *xcount)
|
||||
{
|
||||
(void)c;
|
||||
(void)eptr;
|
||||
(void)start_subject;
|
||||
(void)end_subject;
|
||||
(void)utf;
|
||||
(void)xcount;
|
||||
return NULL;
|
||||
}
|
||||
#else
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Match an extended grapheme sequence *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
c the first character
|
||||
eptr pointer to next character
|
||||
start_subject pointer to start of subject
|
||||
end_subject pointer to end of subject
|
||||
utf TRUE if in UTF mode
|
||||
xcount pointer to count of additional characters,
|
||||
or NULL if count not needed
|
||||
|
||||
Returns: pointer after the end of the sequence
|
||||
*/
|
||||
|
||||
PCRE2_SPTR
|
||||
PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
|
||||
PCRE2_SPTR end_subject, BOOL utf, int *xcount)
|
||||
{
|
||||
int lgb = UCD_GRAPHBREAK(c);
|
||||
|
||||
while (eptr < end_subject)
|
||||
{
|
||||
int rgb;
|
||||
int len = 1;
|
||||
if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
|
||||
rgb = UCD_GRAPHBREAK(c);
|
||||
if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
|
||||
|
||||
/* Not breaking between Regional Indicators is allowed only if there
|
||||
are an even number of preceding RIs. */
|
||||
|
||||
if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
|
||||
{
|
||||
int ricount = 0;
|
||||
PCRE2_SPTR bptr = eptr - 1;
|
||||
if (utf) BACKCHAR(bptr);
|
||||
|
||||
/* bptr is pointing to the left-hand character */
|
||||
|
||||
while (bptr > start_subject)
|
||||
{
|
||||
bptr--;
|
||||
if (utf)
|
||||
{
|
||||
BACKCHAR(bptr);
|
||||
GETCHAR(c, bptr);
|
||||
}
|
||||
else
|
||||
c = *bptr;
|
||||
if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break;
|
||||
ricount++;
|
||||
}
|
||||
if ((ricount & 1) != 0) break; /* Grapheme break required */
|
||||
}
|
||||
|
||||
/* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this
|
||||
allows any number of them before a following Extended_Pictographic. */
|
||||
|
||||
if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) ||
|
||||
lgb != ucp_gbExtended_Pictographic)
|
||||
lgb = rgb;
|
||||
|
||||
eptr += len;
|
||||
if (xcount != NULL) *xcount += 1;
|
||||
}
|
||||
|
||||
return eptr;
|
||||
}
|
||||
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* End of pcre2_extuni.c */
|
219
third_party/pcre/pcre2_find_bracket.c
vendored
Normal file
219
third_party/pcre/pcre2_find_bracket.c
vendored
Normal file
|
@ -0,0 +1,219 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains a single function that scans through a compiled pattern
|
||||
until it finds a capturing bracket with the given number, or, if the number is
|
||||
negative, an instance of OP_REVERSE for a lookbehind. The function is called
|
||||
from pcre2_compile.c and also from pcre2_study.c when finding the minimum
|
||||
matching length. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Scan compiled regex for specific bracket *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
code points to start of expression
|
||||
utf TRUE in UTF mode
|
||||
number the required bracket number or negative to find a lookbehind
|
||||
|
||||
Returns: pointer to the opcode for the bracket, or NULL if not found
|
||||
*/
|
||||
|
||||
PCRE2_SPTR
|
||||
PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number)
|
||||
{
|
||||
for (;;)
|
||||
{
|
||||
PCRE2_UCHAR c = *code;
|
||||
|
||||
if (c == OP_END) return NULL;
|
||||
|
||||
/* XCLASS is used for classes that cannot be represented just by a bit map.
|
||||
This includes negated single high-valued characters. CALLOUT_STR is used for
|
||||
callouts with string arguments. In both cases the length in the table is
|
||||
zero; the actual length is stored in the compiled code. */
|
||||
|
||||
if (c == OP_XCLASS) code += GET(code, 1);
|
||||
else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
|
||||
|
||||
/* Handle lookbehind */
|
||||
|
||||
else if (c == OP_REVERSE)
|
||||
{
|
||||
if (number < 0) return (PCRE2_UCHAR *)code;
|
||||
code += PRIV(OP_lengths)[c];
|
||||
}
|
||||
|
||||
/* Handle capturing bracket */
|
||||
|
||||
else if (c == OP_CBRA || c == OP_SCBRA ||
|
||||
c == OP_CBRAPOS || c == OP_SCBRAPOS)
|
||||
{
|
||||
int n = (int)GET2(code, 1+LINK_SIZE);
|
||||
if (n == number) return (PCRE2_UCHAR *)code;
|
||||
code += PRIV(OP_lengths)[c];
|
||||
}
|
||||
|
||||
/* Otherwise, we can get the item's length from the table, except that for
|
||||
repeated character types, we have to test for \p and \P, which have an extra
|
||||
two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
|
||||
must add in its length. */
|
||||
|
||||
else
|
||||
{
|
||||
switch(c)
|
||||
{
|
||||
case OP_TYPESTAR:
|
||||
case OP_TYPEMINSTAR:
|
||||
case OP_TYPEPLUS:
|
||||
case OP_TYPEMINPLUS:
|
||||
case OP_TYPEQUERY:
|
||||
case OP_TYPEMINQUERY:
|
||||
case OP_TYPEPOSSTAR:
|
||||
case OP_TYPEPOSPLUS:
|
||||
case OP_TYPEPOSQUERY:
|
||||
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
|
||||
break;
|
||||
|
||||
case OP_TYPEUPTO:
|
||||
case OP_TYPEMINUPTO:
|
||||
case OP_TYPEEXACT:
|
||||
case OP_TYPEPOSUPTO:
|
||||
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
|
||||
code += 2;
|
||||
break;
|
||||
|
||||
case OP_MARK:
|
||||
case OP_COMMIT_ARG:
|
||||
case OP_PRUNE_ARG:
|
||||
case OP_SKIP_ARG:
|
||||
case OP_THEN_ARG:
|
||||
code += code[1];
|
||||
break;
|
||||
}
|
||||
|
||||
/* Add in the fixed length from the table */
|
||||
|
||||
code += PRIV(OP_lengths)[c];
|
||||
|
||||
/* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
|
||||
followed by a multi-byte character. The length in the table is a minimum, so
|
||||
we have to arrange to skip the extra bytes. */
|
||||
|
||||
#ifdef MAYBE_UTF_MULTI
|
||||
if (utf) switch(c)
|
||||
{
|
||||
case OP_CHAR:
|
||||
case OP_CHARI:
|
||||
case OP_NOT:
|
||||
case OP_NOTI:
|
||||
case OP_EXACT:
|
||||
case OP_EXACTI:
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTEXACTI:
|
||||
case OP_UPTO:
|
||||
case OP_UPTOI:
|
||||
case OP_NOTUPTO:
|
||||
case OP_NOTUPTOI:
|
||||
case OP_MINUPTO:
|
||||
case OP_MINUPTOI:
|
||||
case OP_NOTMINUPTO:
|
||||
case OP_NOTMINUPTOI:
|
||||
case OP_POSUPTO:
|
||||
case OP_POSUPTOI:
|
||||
case OP_NOTPOSUPTO:
|
||||
case OP_NOTPOSUPTOI:
|
||||
case OP_STAR:
|
||||
case OP_STARI:
|
||||
case OP_NOTSTAR:
|
||||
case OP_NOTSTARI:
|
||||
case OP_MINSTAR:
|
||||
case OP_MINSTARI:
|
||||
case OP_NOTMINSTAR:
|
||||
case OP_NOTMINSTARI:
|
||||
case OP_POSSTAR:
|
||||
case OP_POSSTARI:
|
||||
case OP_NOTPOSSTAR:
|
||||
case OP_NOTPOSSTARI:
|
||||
case OP_PLUS:
|
||||
case OP_PLUSI:
|
||||
case OP_NOTPLUS:
|
||||
case OP_NOTPLUSI:
|
||||
case OP_MINPLUS:
|
||||
case OP_MINPLUSI:
|
||||
case OP_NOTMINPLUS:
|
||||
case OP_NOTMINPLUSI:
|
||||
case OP_POSPLUS:
|
||||
case OP_POSPLUSI:
|
||||
case OP_NOTPOSPLUS:
|
||||
case OP_NOTPOSPLUSI:
|
||||
case OP_QUERY:
|
||||
case OP_QUERYI:
|
||||
case OP_NOTQUERY:
|
||||
case OP_NOTQUERYI:
|
||||
case OP_MINQUERY:
|
||||
case OP_MINQUERYI:
|
||||
case OP_NOTMINQUERY:
|
||||
case OP_NOTMINQUERYI:
|
||||
case OP_POSQUERY:
|
||||
case OP_POSQUERYI:
|
||||
case OP_NOTPOSQUERY:
|
||||
case OP_NOTPOSQUERYI:
|
||||
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
|
||||
break;
|
||||
}
|
||||
#else
|
||||
(void)(utf); /* Keep compiler happy by referencing function argument */
|
||||
#endif /* MAYBE_UTF_MULTI */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of pcre2_find_bracket.c */
|
2047
third_party/pcre/pcre2_internal.h
vendored
Normal file
2047
third_party/pcre/pcre2_internal.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
934
third_party/pcre/pcre2_intmodedep.h
vendored
Normal file
934
third_party/pcre/pcre2_intmodedep.h
vendored
Normal file
|
@ -0,0 +1,934 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains mode-dependent macro and structure definitions. The
|
||||
file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
|
||||
These mode-dependent items are kept in a separate file so that they can also be
|
||||
#included multiple times for different code unit widths by pcre2test in order
|
||||
to have access to the hidden structures at all supported widths.
|
||||
|
||||
Some of the mode-dependent macros are required at different widths for
|
||||
different parts of the pcre2test code (in particular, the included
|
||||
pcre_printint.c file). We undefine them here so that they can be re-defined for
|
||||
multiple inclusions. Not all of these are used in pcre2test, but it's easier
|
||||
just to undefine them all. */
|
||||
|
||||
#undef ACROSSCHAR
|
||||
#undef BACKCHAR
|
||||
#undef BYTES2CU
|
||||
#undef CHMAX_255
|
||||
#undef CU2BYTES
|
||||
#undef FORWARDCHAR
|
||||
#undef FORWARDCHARTEST
|
||||
#undef GET
|
||||
#undef GET2
|
||||
#undef GETCHAR
|
||||
#undef GETCHARINC
|
||||
#undef GETCHARINCTEST
|
||||
#undef GETCHARLEN
|
||||
#undef GETCHARLENTEST
|
||||
#undef GETCHARTEST
|
||||
#undef GET_EXTRALEN
|
||||
#undef HAS_EXTRALEN
|
||||
#undef IMM2_SIZE
|
||||
#undef MAX_255
|
||||
#undef MAX_MARK
|
||||
#undef MAX_PATTERN_SIZE
|
||||
#undef MAX_UTF_SINGLE_CU
|
||||
#undef NOT_FIRSTCU
|
||||
#undef PUT
|
||||
#undef PUT2
|
||||
#undef PUT2INC
|
||||
#undef PUTCHAR
|
||||
#undef PUTINC
|
||||
#undef TABLE_GET
|
||||
|
||||
|
||||
|
||||
/* -------------------------- MACROS ----------------------------- */
|
||||
|
||||
/* PCRE keeps offsets in its compiled code as at least 16-bit quantities
|
||||
(always stored in big-endian order in 8-bit mode) by default. These are used,
|
||||
for example, to link from the start of a subpattern to its alternatives and its
|
||||
end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
|
||||
to around 64K, which is big enough for almost everybody. However, I received a
|
||||
request for an even bigger limit. For this reason, and also to make the code
|
||||
easier to maintain, the storing and loading of offsets from the compiled code
|
||||
unit string is now handled by the macros that are defined here.
|
||||
|
||||
The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
|
||||
values of 3 or 4 are also supported. */
|
||||
|
||||
/* ------------------- 8-bit support ------------------ */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
|
||||
#if LINK_SIZE == 2
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (PCRE2_UCHAR)((d) >> 8)), \
|
||||
(a[(n)+1] = (PCRE2_UCHAR)((d) & 255))
|
||||
#define GET(a,n) \
|
||||
(unsigned int)(((a)[n] << 8) | (a)[(n)+1])
|
||||
#define MAX_PATTERN_SIZE (1 << 16)
|
||||
|
||||
#elif LINK_SIZE == 3
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (PCRE2_UCHAR)((d) >> 16)), \
|
||||
(a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \
|
||||
(a[(n)+2] = (PCRE2_UCHAR)((d) & 255))
|
||||
#define GET(a,n) \
|
||||
(unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
|
||||
#define MAX_PATTERN_SIZE (1 << 24)
|
||||
|
||||
#elif LINK_SIZE == 4
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (PCRE2_UCHAR)((d) >> 24)), \
|
||||
(a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \
|
||||
(a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \
|
||||
(a[(n)+3] = (PCRE2_UCHAR)((d) & 255))
|
||||
#define GET(a,n) \
|
||||
(unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
|
||||
#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
|
||||
|
||||
#else
|
||||
#error LINK_SIZE must be 2, 3, or 4
|
||||
#endif
|
||||
|
||||
|
||||
/* ------------------- 16-bit support ------------------ */
|
||||
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
|
||||
#if LINK_SIZE == 2
|
||||
#undef LINK_SIZE
|
||||
#define LINK_SIZE 1
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (PCRE2_UCHAR)(d))
|
||||
#define GET(a,n) \
|
||||
(a[n])
|
||||
#define MAX_PATTERN_SIZE (1 << 16)
|
||||
|
||||
#elif LINK_SIZE == 3 || LINK_SIZE == 4
|
||||
#undef LINK_SIZE
|
||||
#define LINK_SIZE 2
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (PCRE2_UCHAR)((d) >> 16)), \
|
||||
(a[(n)+1] = (PCRE2_UCHAR)((d) & 65535))
|
||||
#define GET(a,n) \
|
||||
(unsigned int)(((a)[n] << 16) | (a)[(n)+1])
|
||||
#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
|
||||
|
||||
#else
|
||||
#error LINK_SIZE must be 2, 3, or 4
|
||||
#endif
|
||||
|
||||
|
||||
/* ------------------- 32-bit support ------------------ */
|
||||
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 32
|
||||
#undef LINK_SIZE
|
||||
#define LINK_SIZE 1
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (d))
|
||||
#define GET(a,n) \
|
||||
(a[n])
|
||||
#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
|
||||
|
||||
#else
|
||||
#error Unsupported compiling mode
|
||||
#endif
|
||||
|
||||
|
||||
/* --------------- Other mode-specific macros ----------------- */
|
||||
|
||||
/* PCRE uses some other (at least) 16-bit quantities that do not change when
|
||||
the size of offsets changes. There are used for repeat counts and for other
|
||||
things such as capturing parenthesis numbers in back references.
|
||||
|
||||
Define the number of code units required to hold a 16-bit count/offset, and
|
||||
macros to load and store such a value. For reasons that I do not understand,
|
||||
the expression in the 8-bit GET2 macro is treated by gcc as a signed
|
||||
expression, even when a is declared as unsigned. It seems that any kind of
|
||||
arithmetic results in a signed value. Hence the cast. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
#define IMM2_SIZE 2
|
||||
#define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1])
|
||||
#define PUT2(a,n,d) a[n] = (d) >> 8, a[(n)+1] = (d) & 255
|
||||
|
||||
#else /* Code units are 16 or 32 bits */
|
||||
#define IMM2_SIZE 1
|
||||
#define GET2(a,n) a[n]
|
||||
#define PUT2(a,n,d) a[n] = d
|
||||
#endif
|
||||
|
||||
/* Other macros that are different for 8-bit mode. The MAX_255 macro checks
|
||||
whether its argument, which is assumed to be one code unit, is less than 256.
|
||||
The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK
|
||||
name must fit in one code unit; currently it is set to 255 or 65535. The
|
||||
TABLE_GET macro is used to access elements of tables containing exactly 256
|
||||
items. Its argument is a code unit. When code points can be greater than 255, a
|
||||
check is needed before accessing these tables. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
#define MAX_255(c) TRUE
|
||||
#define MAX_MARK ((1u << 8) - 1)
|
||||
#define TABLE_GET(c, table, default) ((table)[c])
|
||||
#ifdef SUPPORT_UNICODE
|
||||
#define SUPPORT_WIDE_CHARS
|
||||
#define CHMAX_255(c) ((c) <= 255u)
|
||||
#else
|
||||
#define CHMAX_255(c) TRUE
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
#else /* Code units are 16 or 32 bits */
|
||||
#define CHMAX_255(c) ((c) <= 255u)
|
||||
#define MAX_255(c) ((c) <= 255u)
|
||||
#define MAX_MARK ((1u << 16) - 1)
|
||||
#define SUPPORT_WIDE_CHARS
|
||||
#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))
|
||||
#endif
|
||||
|
||||
|
||||
/* ----------------- Character-handling macros ----------------- */
|
||||
|
||||
/* There is a proposed future special "UTF-21" mode, in which only the lowest
|
||||
21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
|
||||
high-order bits available to the application for other uses. In preparation for
|
||||
the future implementation of this mode, there are macros that load a data item
|
||||
and, if in this special mode, mask it to 21 bits. These macros all have names
|
||||
starting with UCHAR21. In all other modes, including the normal 32-bit
|
||||
library, the macros all have the same simple definitions. When the new mode is
|
||||
implemented, it is expected that these definitions will be varied appropriately
|
||||
using #ifdef when compiling the library that supports the special mode. */
|
||||
|
||||
#define UCHAR21(eptr) (*(eptr))
|
||||
#define UCHAR21TEST(eptr) (*(eptr))
|
||||
#define UCHAR21INC(eptr) (*(eptr)++)
|
||||
#define UCHAR21INCTEST(eptr) (*(eptr)++)
|
||||
|
||||
/* When UTF encoding is being used, a character is no longer just a single
|
||||
byte in 8-bit mode or a single short in 16-bit mode. The macros for character
|
||||
handling generate simple sequences when used in the basic mode, and more
|
||||
complicated ones for UTF characters. GETCHARLENTEST and other macros are not
|
||||
used when UTF is not supported. To make sure they can never even appear when
|
||||
UTF support is omitted, we don't even define them. */
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
|
||||
/* #define MAX_UTF_SINGLE_CU */
|
||||
/* #define HAS_EXTRALEN(c) */
|
||||
/* #define GET_EXTRALEN(c) */
|
||||
/* #define NOT_FIRSTCU(c) */
|
||||
#define GETCHAR(c, eptr) c = *eptr;
|
||||
#define GETCHARTEST(c, eptr) c = *eptr;
|
||||
#define GETCHARINC(c, eptr) c = *eptr++;
|
||||
#define GETCHARINCTEST(c, eptr) c = *eptr++;
|
||||
#define GETCHARLEN(c, eptr, len) c = *eptr;
|
||||
#define PUTCHAR(c, p) (*p = c, 1)
|
||||
/* #define GETCHARLENTEST(c, eptr, len) */
|
||||
/* #define BACKCHAR(eptr) */
|
||||
/* #define FORWARDCHAR(eptr) */
|
||||
/* #define FORWARCCHARTEST(eptr,end) */
|
||||
/* #define ACROSSCHAR(condition, eptr, action) */
|
||||
|
||||
#else /* SUPPORT_UNICODE */
|
||||
|
||||
/* ------------------- 8-bit support ------------------ */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */
|
||||
|
||||
/* The largest UTF code point that can be encoded as a single code unit. */
|
||||
|
||||
#define MAX_UTF_SINGLE_CU 127
|
||||
|
||||
/* Tests whether the code point needs extra characters to decode. */
|
||||
|
||||
#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)
|
||||
|
||||
/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
|
||||
Otherwise it has an undefined behaviour. */
|
||||
|
||||
#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])
|
||||
|
||||
/* Returns TRUE, if the given value is not the first code unit of a UTF
|
||||
sequence. */
|
||||
|
||||
#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)
|
||||
|
||||
/* Get the next UTF-8 character, not advancing the pointer. This is called when
|
||||
we know we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHAR(c, eptr) \
|
||||
c = *eptr; \
|
||||
if (c >= 0xc0u) GETUTF8(c, eptr);
|
||||
|
||||
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
|
||||
pointer. */
|
||||
|
||||
#define GETCHARTEST(c, eptr) \
|
||||
c = *eptr; \
|
||||
if (utf && c >= 0xc0u) GETUTF8(c, eptr);
|
||||
|
||||
/* Get the next UTF-8 character, advancing the pointer. This is called when we
|
||||
know we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARINC(c, eptr) \
|
||||
c = *eptr++; \
|
||||
if (c >= 0xc0u) GETUTF8INC(c, eptr);
|
||||
|
||||
/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
|
||||
This is called when we don't know if we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARINCTEST(c, eptr) \
|
||||
c = *eptr++; \
|
||||
if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);
|
||||
|
||||
/* Get the next UTF-8 character, not advancing the pointer, incrementing length
|
||||
if there are extra bytes. This is called when we know we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARLEN(c, eptr, len) \
|
||||
c = *eptr; \
|
||||
if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);
|
||||
|
||||
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
|
||||
pointer, incrementing length if there are extra bytes. This is called when we
|
||||
do not know if we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARLENTEST(c, eptr, len) \
|
||||
c = *eptr; \
|
||||
if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);
|
||||
|
||||
/* If the pointer is not at the start of a character, move it back until
|
||||
it is. This is called only in UTF-8 mode - we don't put a test within the macro
|
||||
because almost all calls are already within a block of UTF-8 only code. */
|
||||
|
||||
#define BACKCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr--
|
||||
|
||||
/* Same as above, just in the other direction. */
|
||||
#define FORWARDCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr++
|
||||
#define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0u) == 0x80u) eptr++
|
||||
|
||||
/* Same as above, but it allows a fully customizable form. */
|
||||
#define ACROSSCHAR(condition, eptr, action) \
|
||||
while((condition) && ((*eptr) & 0xc0u) == 0x80u) action
|
||||
|
||||
/* Deposit a character into memory, returning the number of code units. */
|
||||
|
||||
#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
|
||||
PRIV(ord2utf)(c,p) : (*p = c, 1))
|
||||
|
||||
|
||||
/* ------------------- 16-bit support ------------------ */
|
||||
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
#define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */
|
||||
|
||||
/* The largest UTF code point that can be encoded as a single code unit. */
|
||||
|
||||
#define MAX_UTF_SINGLE_CU 65535
|
||||
|
||||
/* Tests whether the code point needs extra characters to decode. */
|
||||
|
||||
#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)
|
||||
|
||||
/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
|
||||
Otherwise it has an undefined behaviour. */
|
||||
|
||||
#define GET_EXTRALEN(c) 1
|
||||
|
||||
/* Returns TRUE, if the given value is not the first code unit of a UTF
|
||||
sequence. */
|
||||
|
||||
#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)
|
||||
|
||||
/* Base macro to pick up the low surrogate of a UTF-16 character, not
|
||||
advancing the pointer. */
|
||||
|
||||
#define GETUTF16(c, eptr) \
|
||||
{ c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; }
|
||||
|
||||
/* Get the next UTF-16 character, not advancing the pointer. This is called when
|
||||
we know we are in UTF-16 mode. */
|
||||
|
||||
#define GETCHAR(c, eptr) \
|
||||
c = *eptr; \
|
||||
if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);
|
||||
|
||||
/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
|
||||
pointer. */
|
||||
|
||||
#define GETCHARTEST(c, eptr) \
|
||||
c = *eptr; \
|
||||
if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);
|
||||
|
||||
/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
|
||||
the pointer. */
|
||||
|
||||
#define GETUTF16INC(c, eptr) \
|
||||
{ c = (((c & 0x3ffu) << 10) | (*eptr++ & 0x3ffu)) + 0x10000u; }
|
||||
|
||||
/* Get the next UTF-16 character, advancing the pointer. This is called when we
|
||||
know we are in UTF-16 mode. */
|
||||
|
||||
#define GETCHARINC(c, eptr) \
|
||||
c = *eptr++; \
|
||||
if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);
|
||||
|
||||
/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
|
||||
This is called when we don't know if we are in UTF-16 mode. */
|
||||
|
||||
#define GETCHARINCTEST(c, eptr) \
|
||||
c = *eptr++; \
|
||||
if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);
|
||||
|
||||
/* Base macro to pick up the low surrogate of a UTF-16 character, not
|
||||
advancing the pointer, incrementing the length. */
|
||||
|
||||
#define GETUTF16LEN(c, eptr, len) \
|
||||
{ c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; len++; }
|
||||
|
||||
/* Get the next UTF-16 character, not advancing the pointer, incrementing
|
||||
length if there is a low surrogate. This is called when we know we are in
|
||||
UTF-16 mode. */
|
||||
|
||||
#define GETCHARLEN(c, eptr, len) \
|
||||
c = *eptr; \
|
||||
if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
|
||||
|
||||
/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the
|
||||
pointer, incrementing length if there is a low surrogate. This is called when
|
||||
we do not know if we are in UTF-16 mode. */
|
||||
|
||||
#define GETCHARLENTEST(c, eptr, len) \
|
||||
c = *eptr; \
|
||||
if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
|
||||
|
||||
/* If the pointer is not at the start of a character, move it back until
|
||||
it is. This is called only in UTF-16 mode - we don't put a test within the
|
||||
macro because almost all calls are already within a block of UTF-16 only
|
||||
code. */
|
||||
|
||||
#define BACKCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr--
|
||||
|
||||
/* Same as above, just in the other direction. */
|
||||
#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr++
|
||||
#define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00u) == 0xdc00u) eptr++
|
||||
|
||||
/* Same as above, but it allows a fully customizable form. */
|
||||
#define ACROSSCHAR(condition, eptr, action) \
|
||||
if ((condition) && ((*eptr) & 0xfc00u) == 0xdc00u) action
|
||||
|
||||
/* Deposit a character into memory, returning the number of code units. */
|
||||
|
||||
#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
|
||||
PRIV(ord2utf)(c,p) : (*p = c, 1))
|
||||
|
||||
|
||||
/* ------------------- 32-bit support ------------------ */
|
||||
|
||||
#else
|
||||
|
||||
/* These are trivial for the 32-bit library, since all UTF-32 characters fit
|
||||
into one PCRE2_UCHAR unit. */
|
||||
|
||||
#define MAX_UTF_SINGLE_CU (0x10ffffu)
|
||||
#define HAS_EXTRALEN(c) (0)
|
||||
#define GET_EXTRALEN(c) (0)
|
||||
#define NOT_FIRSTCU(c) (0)
|
||||
|
||||
/* Get the next UTF-32 character, not advancing the pointer. This is called when
|
||||
we know we are in UTF-32 mode. */
|
||||
|
||||
#define GETCHAR(c, eptr) \
|
||||
c = *(eptr);
|
||||
|
||||
/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
|
||||
pointer. */
|
||||
|
||||
#define GETCHARTEST(c, eptr) \
|
||||
c = *(eptr);
|
||||
|
||||
/* Get the next UTF-32 character, advancing the pointer. This is called when we
|
||||
know we are in UTF-32 mode. */
|
||||
|
||||
#define GETCHARINC(c, eptr) \
|
||||
c = *((eptr)++);
|
||||
|
||||
/* Get the next character, testing for UTF-32 mode, and advancing the pointer.
|
||||
This is called when we don't know if we are in UTF-32 mode. */
|
||||
|
||||
#define GETCHARINCTEST(c, eptr) \
|
||||
c = *((eptr)++);
|
||||
|
||||
/* Get the next UTF-32 character, not advancing the pointer, not incrementing
|
||||
length (since all UTF-32 is of length 1). This is called when we know we are in
|
||||
UTF-32 mode. */
|
||||
|
||||
#define GETCHARLEN(c, eptr, len) \
|
||||
GETCHAR(c, eptr)
|
||||
|
||||
/* Get the next UTF-32character, testing for UTF-32 mode, not advancing the
|
||||
pointer, not incrementing the length (since all UTF-32 is of length 1).
|
||||
This is called when we do not know if we are in UTF-32 mode. */
|
||||
|
||||
#define GETCHARLENTEST(c, eptr, len) \
|
||||
GETCHARTEST(c, eptr)
|
||||
|
||||
/* If the pointer is not at the start of a character, move it back until
|
||||
it is. This is called only in UTF-32 mode - we don't put a test within the
|
||||
macro because almost all calls are already within a block of UTF-32 only
|
||||
code.
|
||||
|
||||
These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */
|
||||
|
||||
#define BACKCHAR(eptr) do { } while (0)
|
||||
|
||||
/* Same as above, just in the other direction. */
|
||||
|
||||
#define FORWARDCHAR(eptr) do { } while (0)
|
||||
#define FORWARDCHARTEST(eptr,end) do { } while (0)
|
||||
|
||||
/* Same as above, but it allows a fully customizable form. */
|
||||
|
||||
#define ACROSSCHAR(condition, eptr, action) do { } while (0)
|
||||
|
||||
/* Deposit a character into memory, returning the number of code units. */
|
||||
|
||||
#define PUTCHAR(c, p) (*p = c, 1)
|
||||
|
||||
#endif /* UTF-32 character handling */
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
|
||||
/* Mode-dependent macros that have the same definition in all modes. */
|
||||
|
||||
#define CU2BYTES(x) ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
|
||||
#define BYTES2CU(x) ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))
|
||||
#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
|
||||
#define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE
|
||||
|
||||
|
||||
/* ----------------------- HIDDEN STRUCTURES ----------------------------- */
|
||||
|
||||
/* NOTE: All these structures *must* start with a pcre2_memctl structure. The
|
||||
code that uses them is simpler because it assumes this. */
|
||||
|
||||
/* The real general context structure. At present it holds only data for custom
|
||||
memory control. */
|
||||
|
||||
typedef struct pcre2_real_general_context {
|
||||
pcre2_memctl memctl;
|
||||
} pcre2_real_general_context;
|
||||
|
||||
/* The real compile context structure */
|
||||
|
||||
typedef struct pcre2_real_compile_context {
|
||||
pcre2_memctl memctl;
|
||||
int (*stack_guard)(uint32_t, void *);
|
||||
void *stack_guard_data;
|
||||
const uint8_t *tables;
|
||||
PCRE2_SIZE max_pattern_length;
|
||||
uint16_t bsr_convention;
|
||||
uint16_t newline_convention;
|
||||
uint32_t parens_nest_limit;
|
||||
uint32_t extra_options;
|
||||
} pcre2_real_compile_context;
|
||||
|
||||
/* The real match context structure. */
|
||||
|
||||
typedef struct pcre2_real_match_context {
|
||||
pcre2_memctl memctl;
|
||||
#ifdef SUPPORT_JIT
|
||||
pcre2_jit_callback jit_callback;
|
||||
void *jit_callback_data;
|
||||
#endif
|
||||
int (*callout)(pcre2_callout_block *, void *);
|
||||
void *callout_data;
|
||||
int (*substitute_callout)(pcre2_substitute_callout_block *, void *);
|
||||
void *substitute_callout_data;
|
||||
PCRE2_SIZE offset_limit;
|
||||
uint32_t heap_limit;
|
||||
uint32_t match_limit;
|
||||
uint32_t depth_limit;
|
||||
} pcre2_real_match_context;
|
||||
|
||||
/* The real convert context structure. */
|
||||
|
||||
typedef struct pcre2_real_convert_context {
|
||||
pcre2_memctl memctl;
|
||||
uint32_t glob_separator;
|
||||
uint32_t glob_escape;
|
||||
} pcre2_real_convert_context;
|
||||
|
||||
/* The real compiled code structure. The type for the blocksize field is
|
||||
defined specially because it is required in pcre2_serialize_decode() when
|
||||
copying the size from possibly unaligned memory into a variable of the same
|
||||
type. Use a macro rather than a typedef to avoid compiler warnings when this
|
||||
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
|
||||
largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
|
||||
argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
|
||||
here.) */
|
||||
|
||||
#undef CODE_BLOCKSIZE_TYPE
|
||||
#define CODE_BLOCKSIZE_TYPE size_t
|
||||
|
||||
#undef LOOKBEHIND_MAX
|
||||
#define LOOKBEHIND_MAX UINT16_MAX
|
||||
|
||||
typedef struct pcre2_real_code {
|
||||
pcre2_memctl memctl; /* Memory control fields */
|
||||
const uint8_t *tables; /* The character tables */
|
||||
void *executable_jit; /* Pointer to JIT code */
|
||||
uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
|
||||
CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */
|
||||
uint32_t magic_number; /* Paranoid and endianness check */
|
||||
uint32_t compile_options; /* Options passed to pcre2_compile() */
|
||||
uint32_t overall_options; /* Options after processing the pattern */
|
||||
uint32_t extra_options; /* Taken from compile_context */
|
||||
uint32_t flags; /* Various state flags */
|
||||
uint32_t limit_heap; /* Limit set in the pattern */
|
||||
uint32_t limit_match; /* Limit set in the pattern */
|
||||
uint32_t limit_depth; /* Limit set in the pattern */
|
||||
uint32_t first_codeunit; /* Starting code unit */
|
||||
uint32_t last_codeunit; /* This codeunit must be seen */
|
||||
uint16_t bsr_convention; /* What \R matches */
|
||||
uint16_t newline_convention; /* What is a newline? */
|
||||
uint16_t max_lookbehind; /* Longest lookbehind (characters) */
|
||||
uint16_t minlength; /* Minimum length of match */
|
||||
uint16_t top_bracket; /* Highest numbered group */
|
||||
uint16_t top_backref; /* Highest numbered back reference */
|
||||
uint16_t name_entry_size; /* Size (code units) of table entries */
|
||||
uint16_t name_count; /* Number of name entries in the table */
|
||||
} pcre2_real_code;
|
||||
|
||||
/* The real match data structure. Define ovector as large as it can ever
|
||||
actually be so that array bound checkers don't grumble. Memory for this
|
||||
structure is obtained by calling pcre2_match_data_create(), which sets the size
|
||||
as the offset of ovector plus a pair of elements for each capturable string, so
|
||||
the size varies from call to call. As the maximum number of capturing
|
||||
subpatterns is 65535 we must allow for 65536 strings to include the overall
|
||||
match. (See also the heapframe structure below.) */
|
||||
|
||||
struct heapframe; /* Forward reference */
|
||||
|
||||
typedef struct pcre2_real_match_data {
|
||||
pcre2_memctl memctl; /* Memory control fields */
|
||||
const pcre2_real_code *code; /* The pattern used for the match */
|
||||
PCRE2_SPTR subject; /* The subject that was matched */
|
||||
PCRE2_SPTR mark; /* Pointer to last mark */
|
||||
struct heapframe *heapframes; /* Backtracking frames heap memory */
|
||||
PCRE2_SIZE heapframes_size; /* Malloc-ed size */
|
||||
PCRE2_SIZE leftchar; /* Offset to leftmost code unit */
|
||||
PCRE2_SIZE rightchar; /* Offset to rightmost code unit */
|
||||
PCRE2_SIZE startchar; /* Offset to starting code unit */
|
||||
uint8_t matchedby; /* Type of match (normal, JIT, DFA) */
|
||||
uint8_t flags; /* Various flags */
|
||||
uint16_t oveccount; /* Number of pairs */
|
||||
int rc; /* The return code from the match */
|
||||
PCRE2_SIZE ovector[131072]; /* Must be last in the structure */
|
||||
} pcre2_real_match_data;
|
||||
|
||||
|
||||
/* ----------------------- PRIVATE STRUCTURES ----------------------------- */
|
||||
|
||||
/* These structures are not needed for pcre2test. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
|
||||
/* Structures for checking for mutual recursion when scanning compiled or
|
||||
parsed code. */
|
||||
|
||||
typedef struct recurse_check {
|
||||
struct recurse_check *prev;
|
||||
PCRE2_SPTR group;
|
||||
} recurse_check;
|
||||
|
||||
typedef struct parsed_recurse_check {
|
||||
struct parsed_recurse_check *prev;
|
||||
uint32_t *groupptr;
|
||||
} parsed_recurse_check;
|
||||
|
||||
/* Structure for building a cache when filling in recursion offsets. */
|
||||
|
||||
typedef struct recurse_cache {
|
||||
PCRE2_SPTR group;
|
||||
int groupnumber;
|
||||
} recurse_cache;
|
||||
|
||||
/* Structure for maintaining a chain of pointers to the currently incomplete
|
||||
branches, for testing for left recursion while compiling. */
|
||||
|
||||
typedef struct branch_chain {
|
||||
struct branch_chain *outer;
|
||||
PCRE2_UCHAR *current_branch;
|
||||
} branch_chain;
|
||||
|
||||
/* Structure for building a list of named groups during the first pass of
|
||||
compiling. */
|
||||
|
||||
typedef struct named_group {
|
||||
PCRE2_SPTR name; /* Points to the name in the pattern */
|
||||
uint32_t number; /* Group number */
|
||||
uint16_t length; /* Length of the name */
|
||||
uint16_t isdup; /* TRUE if a duplicate */
|
||||
} named_group;
|
||||
|
||||
/* Structure for passing "static" information around between the functions
|
||||
doing the compiling, so that they are thread-safe. */
|
||||
|
||||
typedef struct compile_block {
|
||||
pcre2_real_compile_context *cx; /* Points to the compile context */
|
||||
const uint8_t *lcc; /* Points to lower casing table */
|
||||
const uint8_t *fcc; /* Points to case-flipping table */
|
||||
const uint8_t *cbits; /* Points to character type table */
|
||||
const uint8_t *ctypes; /* Points to table of type maps */
|
||||
PCRE2_SPTR start_workspace; /* The start of working space */
|
||||
PCRE2_SPTR start_code; /* The start of the compiled code */
|
||||
PCRE2_SPTR start_pattern; /* The start of the pattern */
|
||||
PCRE2_SPTR end_pattern; /* The end of the pattern */
|
||||
PCRE2_UCHAR *name_table; /* The name/number table */
|
||||
PCRE2_SIZE workspace_size; /* Size of workspace */
|
||||
PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */
|
||||
PCRE2_SIZE erroroffset; /* Offset of error in pattern */
|
||||
uint16_t names_found; /* Number of entries so far */
|
||||
uint16_t name_entry_size; /* Size of each entry */
|
||||
uint16_t parens_depth; /* Depth of nested parentheses */
|
||||
uint16_t assert_depth; /* Depth of nested assertions */
|
||||
open_capitem *open_caps; /* Chain of open capture items */
|
||||
named_group *named_groups; /* Points to vector in pre-compile */
|
||||
uint32_t named_group_list_size; /* Number of entries in the list */
|
||||
uint32_t external_options; /* External (initial) options */
|
||||
uint32_t external_flags; /* External flag bits to be set */
|
||||
uint32_t bracount; /* Count of capturing parentheses */
|
||||
uint32_t lastcapture; /* Last capture encountered */
|
||||
uint32_t *parsed_pattern; /* Parsed pattern buffer */
|
||||
uint32_t *parsed_pattern_end; /* Parsed pattern should not get here */
|
||||
uint32_t *groupinfo; /* Group info vector */
|
||||
uint32_t top_backref; /* Maximum back reference */
|
||||
uint32_t backref_map; /* Bitmap of low back refs */
|
||||
uint32_t nltype; /* Newline type */
|
||||
uint32_t nllen; /* Newline string length */
|
||||
uint32_t class_range_start; /* Overall class range start */
|
||||
uint32_t class_range_end; /* Overall class range end */
|
||||
PCRE2_UCHAR nl[4]; /* Newline string when fixed length */
|
||||
uint32_t req_varyopt; /* "After variable item" flag for reqbyte */
|
||||
int max_lookbehind; /* Maximum lookbehind (characters) */
|
||||
BOOL had_accept; /* (*ACCEPT) encountered */
|
||||
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
|
||||
BOOL had_recurse; /* Had a recursion or subroutine call */
|
||||
BOOL dupnames; /* Duplicate names exist */
|
||||
} compile_block;
|
||||
|
||||
/* Structure for keeping the properties of the in-memory stack used
|
||||
by the JIT matcher. */
|
||||
|
||||
typedef struct pcre2_real_jit_stack {
|
||||
pcre2_memctl memctl;
|
||||
void* stack;
|
||||
} pcre2_real_jit_stack;
|
||||
|
||||
/* Structure for items in a linked list that represents an explicit recursive
|
||||
call within the pattern when running pcre2_dfa_match(). */
|
||||
|
||||
typedef struct dfa_recursion_info {
|
||||
struct dfa_recursion_info *prevrec;
|
||||
PCRE2_SPTR subject_position;
|
||||
uint32_t group_num;
|
||||
} dfa_recursion_info;
|
||||
|
||||
/* Structure for "stack" frames that are used for remembering backtracking
|
||||
positions during matching. As these are used in a vector, with the ovector item
|
||||
being extended, the size of the structure must be a multiple of PCRE2_SIZE. The
|
||||
only way to check this at compile time is to force an error by generating an
|
||||
array with a negative size. By putting this in a typedef (which is never used),
|
||||
we don't generate any code when all is well. */
|
||||
|
||||
typedef struct heapframe {
|
||||
|
||||
/* The first set of fields are variables that have to be preserved over calls
|
||||
to RRMATCH(), but which do not need to be copied to new frames. */
|
||||
|
||||
PCRE2_SPTR ecode; /* The current position in the pattern */
|
||||
PCRE2_SPTR temp_sptr[2]; /* Used for short-term PCRE_SPTR values */
|
||||
PCRE2_SIZE length; /* Used for character, string, or code lengths */
|
||||
PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */
|
||||
PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */
|
||||
uint32_t rdepth; /* "Recursion" depth */
|
||||
uint32_t group_frame_type; /* Type information for group frames */
|
||||
uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */
|
||||
uint8_t return_id; /* Where to go on in internal "return" */
|
||||
uint8_t op; /* Processing opcode */
|
||||
|
||||
/* At this point, the structure is 16-bit aligned. On most architectures
|
||||
the alignment requirement for a pointer will ensure that the eptr field below
|
||||
is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer
|
||||
that is 16-bit aligned. We must therefore ensure that what comes between here
|
||||
and eptr is an odd multiple of 16 bits so as to get back into 32-bit
|
||||
alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs
|
||||
fudges in the other cases. In the 32-bit case the padding comes first so that
|
||||
the occu field itself is 32-bit aligned. Without the padding, this structure
|
||||
is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
PCRE2_UCHAR occu[6]; /* Used for other case code units */
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
PCRE2_UCHAR occu[2]; /* Used for other case code units */
|
||||
uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */
|
||||
#else
|
||||
uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */
|
||||
PCRE2_UCHAR occu[1]; /* Used for other case code units */
|
||||
#endif
|
||||
|
||||
/* The rest have to be copied from the previous frame whenever a new frame
|
||||
becomes current. The final field is specified as a large vector so that
|
||||
runtime array bound checks don't catch references to it. However, for any
|
||||
specific call to pcre2_match() the memory allocated for each frame structure
|
||||
allows for exactly the right size ovector for the number of capturing
|
||||
parentheses. (See also the comment for pcre2_real_match_data above.) */
|
||||
|
||||
PCRE2_SPTR eptr; /* MUST BE FIRST */
|
||||
PCRE2_SPTR start_match; /* Can be adjusted by \K */
|
||||
PCRE2_SPTR mark; /* Most recent mark on the success path */
|
||||
uint32_t current_recurse; /* Current (deepest) recursion number */
|
||||
uint32_t capture_last; /* Most recent capture */
|
||||
PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */
|
||||
PCRE2_SIZE offset_top; /* Offset after highest capture */
|
||||
PCRE2_SIZE ovector[131072]; /* Must be last in the structure */
|
||||
} heapframe;
|
||||
|
||||
/* This typedef is a check that the size of the heapframe structure is a
|
||||
multiple of PCRE2_SIZE. See various comments above. */
|
||||
|
||||
typedef char check_heapframe_size[
|
||||
((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)];
|
||||
|
||||
/* Structure for computing the alignment of heapframe. */
|
||||
|
||||
typedef struct heapframe_align {
|
||||
char unalign; /* Completely unalign the current offset */
|
||||
heapframe frame; /* Offset is its alignment */
|
||||
} heapframe_align;
|
||||
|
||||
/* This define is the minimum alignment required for a heapframe, in bytes. */
|
||||
|
||||
#define HEAPFRAME_ALIGNMENT offsetof(heapframe_align, frame)
|
||||
|
||||
/* Structure for passing "static" information around between the functions
|
||||
doing traditional NFA matching (pcre2_match() and friends). */
|
||||
|
||||
typedef struct match_block {
|
||||
pcre2_memctl memctl; /* For general use */
|
||||
PCRE2_SIZE heap_limit; /* As it says */
|
||||
uint32_t match_limit; /* As it says */
|
||||
uint32_t match_limit_depth; /* As it says */
|
||||
uint32_t match_call_count; /* Number of times a new frame is created */
|
||||
BOOL hitend; /* Hit the end of the subject at some point */
|
||||
BOOL hasthen; /* Pattern contains (*THEN) */
|
||||
BOOL allowemptypartial; /* Allow empty hard partial */
|
||||
const uint8_t *lcc; /* Points to lower casing table */
|
||||
const uint8_t *fcc; /* Points to case-flipping table */
|
||||
const uint8_t *ctypes; /* Points to table of type maps */
|
||||
PCRE2_SIZE start_offset; /* The start offset value */
|
||||
PCRE2_SIZE end_offset_top; /* Highwater mark at end of match */
|
||||
uint16_t partial; /* PARTIAL options */
|
||||
uint16_t bsr_convention; /* \R interpretation */
|
||||
uint16_t name_count; /* Number of names in name table */
|
||||
uint16_t name_entry_size; /* Size of entry in names table */
|
||||
PCRE2_SPTR name_table; /* Table of group names */
|
||||
PCRE2_SPTR start_code; /* For use when recursing */
|
||||
PCRE2_SPTR start_subject; /* Start of the subject string */
|
||||
PCRE2_SPTR check_subject; /* Where UTF-checked from */
|
||||
PCRE2_SPTR end_subject; /* End of the subject string */
|
||||
PCRE2_SPTR end_match_ptr; /* Subject position at end match */
|
||||
PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
|
||||
PCRE2_SPTR last_used_ptr; /* Latest consulted character */
|
||||
PCRE2_SPTR mark; /* Mark pointer to pass back on success */
|
||||
PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */
|
||||
PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
|
||||
PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
|
||||
uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */
|
||||
uint32_t moptions; /* Match options */
|
||||
uint32_t poptions; /* Pattern options */
|
||||
uint32_t skip_arg_count; /* For counting SKIP_ARGs */
|
||||
uint32_t ignore_skip_arg; /* For re-run when SKIP arg name not found */
|
||||
uint32_t nltype; /* Newline type */
|
||||
uint32_t nllen; /* Newline string length */
|
||||
PCRE2_UCHAR nl[4]; /* Newline string when fixed */
|
||||
pcre2_callout_block *cb; /* Points to a callout block */
|
||||
void *callout_data; /* To pass back to callouts */
|
||||
int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */
|
||||
} match_block;
|
||||
|
||||
/* A similar structure is used for the same purpose by the DFA matching
|
||||
functions. */
|
||||
|
||||
typedef struct dfa_match_block {
|
||||
pcre2_memctl memctl; /* For general use */
|
||||
PCRE2_SPTR start_code; /* Start of the compiled pattern */
|
||||
PCRE2_SPTR start_subject ; /* Start of the subject string */
|
||||
PCRE2_SPTR end_subject; /* End of subject string */
|
||||
PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
|
||||
PCRE2_SPTR last_used_ptr; /* Latest consulted character */
|
||||
const uint8_t *tables; /* Character tables */
|
||||
PCRE2_SIZE start_offset; /* The start offset value */
|
||||
PCRE2_SIZE heap_limit; /* As it says */
|
||||
PCRE2_SIZE heap_used; /* As it says */
|
||||
uint32_t match_limit; /* As it says */
|
||||
uint32_t match_limit_depth; /* As it says */
|
||||
uint32_t match_call_count; /* Number of calls of internal function */
|
||||
uint32_t moptions; /* Match options */
|
||||
uint32_t poptions; /* Pattern options */
|
||||
uint32_t nltype; /* Newline type */
|
||||
uint32_t nllen; /* Newline string length */
|
||||
BOOL allowemptypartial; /* Allow empty hard partial */
|
||||
PCRE2_UCHAR nl[4]; /* Newline string when fixed */
|
||||
uint16_t bsr_convention; /* \R interpretation */
|
||||
pcre2_callout_block *cb; /* Points to a callout block */
|
||||
void *callout_data; /* To pass back to callouts */
|
||||
int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */
|
||||
dfa_recursion_info *recursive; /* Linked list of recursion data */
|
||||
} dfa_match_block;
|
||||
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* End of pcre2_intmodedep.h */
|
163
third_party/pcre/pcre2_maketables.c
vendored
Normal file
163
third_party/pcre/pcre2_maketables.c
vendored
Normal file
|
@ -0,0 +1,163 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2020 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains the external function pcre2_maketables(), which builds
|
||||
character tables for PCRE2 in the current locale. The file is compiled on its
|
||||
own as part of the PCRE2 library. It is also included in the compilation of
|
||||
pcre2_dftables.c as a freestanding program, in which case the macro
|
||||
PCRE2_DFTABLES is defined. */
|
||||
|
||||
#ifndef PCRE2_DFTABLES /* Compiling the library */
|
||||
# ifdef HAVE_CONFIG_H
|
||||
# include "config.h"
|
||||
# endif
|
||||
# include "pcre2_internal.h"
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Create PCRE2 character tables *
|
||||
*************************************************/
|
||||
|
||||
/* This function builds a set of character tables for use by PCRE2 and returns
|
||||
a pointer to them. They are build using the ctype functions, and consequently
|
||||
their contents will depend upon the current locale setting. When compiled as
|
||||
part of the library, the store is obtained via a general context malloc, if
|
||||
supplied, but when PCRE2_DFTABLES is defined (when compiling the pcre2_dftables
|
||||
freestanding auxiliary program) malloc() is used, and the function has a
|
||||
different name so as not to clash with the prototype in pcre2.h.
|
||||
|
||||
Arguments: none when PCRE2_DFTABLES is defined
|
||||
else a PCRE2 general context or NULL
|
||||
Returns: pointer to the contiguous block of data
|
||||
else NULL if memory allocation failed
|
||||
*/
|
||||
|
||||
#ifdef PCRE2_DFTABLES /* Included in freestanding pcre2_dftables program */
|
||||
static const uint8_t *maketables(void)
|
||||
{
|
||||
uint8_t *yield = (uint8_t *)malloc(TABLES_LENGTH);
|
||||
|
||||
#else /* Not PCRE2_DFTABLES, that is, compiling the library */
|
||||
PCRE2_EXP_DEFN const uint8_t * PCRE2_CALL_CONVENTION
|
||||
pcre2_maketables(pcre2_general_context *gcontext)
|
||||
{
|
||||
uint8_t *yield = (uint8_t *)((gcontext != NULL)?
|
||||
gcontext->memctl.malloc(TABLES_LENGTH, gcontext->memctl.memory_data) :
|
||||
malloc(TABLES_LENGTH));
|
||||
#endif /* PCRE2_DFTABLES */
|
||||
|
||||
int i;
|
||||
uint8_t *p;
|
||||
|
||||
if (yield == NULL) return NULL;
|
||||
p = yield;
|
||||
|
||||
/* First comes the lower casing table */
|
||||
|
||||
for (i = 0; i < 256; i++) *p++ = tolower(i);
|
||||
|
||||
/* Next the case-flipping table */
|
||||
|
||||
for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
|
||||
|
||||
/* Then the character class tables. Don't try to be clever and save effort on
|
||||
exclusive ones - in some locales things may be different.
|
||||
|
||||
Note that the table for "space" includes everything "isspace" gives, including
|
||||
VT in the default locale. This makes it work for the POSIX class [:space:].
|
||||
From PCRE1 release 8.34 and for all PCRE2 releases it is also correct for Perl
|
||||
space, because Perl added VT at release 5.18.
|
||||
|
||||
Note also that it is possible for a character to be alnum or alpha without
|
||||
being lower or upper, such as "male and female ordinals" (\xAA and \xBA) in the
|
||||
fr_FR locale (at least under Debian Linux's locales as of 12/2005). So we must
|
||||
test for alnum specially. */
|
||||
|
||||
memset(p, 0, cbit_length);
|
||||
for (i = 0; i < 256; i++)
|
||||
{
|
||||
if (isdigit(i)) p[cbit_digit + i/8] |= 1u << (i&7);
|
||||
if (isupper(i)) p[cbit_upper + i/8] |= 1u << (i&7);
|
||||
if (islower(i)) p[cbit_lower + i/8] |= 1u << (i&7);
|
||||
if (isalnum(i)) p[cbit_word + i/8] |= 1u << (i&7);
|
||||
if (i == '_') p[cbit_word + i/8] |= 1u << (i&7);
|
||||
if (isspace(i)) p[cbit_space + i/8] |= 1u << (i&7);
|
||||
if (isxdigit(i)) p[cbit_xdigit + i/8] |= 1u << (i&7);
|
||||
if (isgraph(i)) p[cbit_graph + i/8] |= 1u << (i&7);
|
||||
if (isprint(i)) p[cbit_print + i/8] |= 1u << (i&7);
|
||||
if (ispunct(i)) p[cbit_punct + i/8] |= 1u << (i&7);
|
||||
if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1u << (i&7);
|
||||
}
|
||||
p += cbit_length;
|
||||
|
||||
/* Finally, the character type table. In this, we used to exclude VT from the
|
||||
white space chars, because Perl didn't recognize it as such for \s and for
|
||||
comments within regexes. However, Perl changed at release 5.18, so PCRE1
|
||||
changed at release 8.34 and it's always been this way for PCRE2. */
|
||||
|
||||
for (i = 0; i < 256; i++)
|
||||
{
|
||||
int x = 0;
|
||||
if (isspace(i)) x += ctype_space;
|
||||
if (isalpha(i)) x += ctype_letter;
|
||||
if (islower(i)) x += ctype_lcletter;
|
||||
if (isdigit(i)) x += ctype_digit;
|
||||
if (isalnum(i) || i == '_') x += ctype_word;
|
||||
*p++ = x;
|
||||
}
|
||||
|
||||
return yield;
|
||||
}
|
||||
|
||||
#ifndef PCRE2_DFTABLES /* Compiling the library */
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_maketables_free(pcre2_general_context *gcontext, const uint8_t *tables)
|
||||
{
|
||||
if (gcontext)
|
||||
gcontext->memctl.free((void *)tables, gcontext->memctl.memory_data);
|
||||
else
|
||||
free((void *)tables);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* End of pcre2_maketables.c */
|
7548
third_party/pcre/pcre2_match.c
vendored
Normal file
7548
third_party/pcre/pcre2_match.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
173
third_party/pcre/pcre2_match_data.c
vendored
Normal file
173
third_party/pcre/pcre2_match_data.c
vendored
Normal file
|
@ -0,0 +1,173 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Create a match data block given ovector size *
|
||||
*************************************************/
|
||||
|
||||
/* A minimum of 1 is imposed on the number of ovector pairs. A maximum is also
|
||||
imposed because the oveccount field in a match data block is uintt6_t. */
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION
|
||||
pcre2_match_data_create(uint32_t oveccount, pcre2_general_context *gcontext)
|
||||
{
|
||||
pcre2_match_data *yield;
|
||||
if (oveccount < 1) oveccount = 1;
|
||||
if (oveccount > UINT16_MAX) oveccount = UINT16_MAX;
|
||||
yield = PRIV(memctl_malloc)(
|
||||
offsetof(pcre2_match_data, ovector) + 2*oveccount*sizeof(PCRE2_SIZE),
|
||||
(pcre2_memctl *)gcontext);
|
||||
if (yield == NULL) return NULL;
|
||||
yield->oveccount = oveccount;
|
||||
yield->flags = 0;
|
||||
yield->heapframes = NULL;
|
||||
yield->heapframes_size = 0;
|
||||
return yield;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Create a match data block using pattern data *
|
||||
*************************************************/
|
||||
|
||||
/* If no context is supplied, use the memory allocator from the code. */
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION
|
||||
pcre2_match_data_create_from_pattern(const pcre2_code *code,
|
||||
pcre2_general_context *gcontext)
|
||||
{
|
||||
if (gcontext == NULL) gcontext = (pcre2_general_context *)code;
|
||||
return pcre2_match_data_create(((pcre2_real_code *)code)->top_bracket + 1,
|
||||
gcontext);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Free a match data block *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_match_data_free(pcre2_match_data *match_data)
|
||||
{
|
||||
if (match_data != NULL)
|
||||
{
|
||||
if (match_data->heapframes != NULL)
|
||||
match_data->memctl.free(match_data->heapframes,
|
||||
match_data->memctl.memory_data);
|
||||
if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
|
||||
match_data->memctl.free((void *)match_data->subject,
|
||||
match_data->memctl.memory_data);
|
||||
match_data->memctl.free(match_data, match_data->memctl.memory_data);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get last mark in match *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN PCRE2_SPTR PCRE2_CALL_CONVENTION
|
||||
pcre2_get_mark(pcre2_match_data *match_data)
|
||||
{
|
||||
return match_data->mark;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get pointer to ovector *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN PCRE2_SIZE * PCRE2_CALL_CONVENTION
|
||||
pcre2_get_ovector_pointer(pcre2_match_data *match_data)
|
||||
{
|
||||
return match_data->ovector;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get number of ovector slots *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN uint32_t PCRE2_CALL_CONVENTION
|
||||
pcre2_get_ovector_count(pcre2_match_data *match_data)
|
||||
{
|
||||
return match_data->oveccount;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get starting code unit in match *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION
|
||||
pcre2_get_startchar(pcre2_match_data *match_data)
|
||||
{
|
||||
return match_data->startchar;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get size of match data block *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION
|
||||
pcre2_get_match_data_size(pcre2_match_data *match_data)
|
||||
{
|
||||
return offsetof(pcre2_match_data, ovector) +
|
||||
2 * (match_data->oveccount) * sizeof(PCRE2_SIZE);
|
||||
}
|
||||
|
||||
/* End of pcre2_match_data.c */
|
243
third_party/pcre/pcre2_newline.c
vendored
Normal file
243
third_party/pcre/pcre2_newline.c
vendored
Normal file
|
@ -0,0 +1,243 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains internal functions for testing newlines when more than
|
||||
one kind of newline is to be recognized. When a newline is found, its length is
|
||||
returned. In principle, we could implement several newline "types", each
|
||||
referring to a different set of newline characters. At present, PCRE2 supports
|
||||
only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF,
|
||||
and NLTYPE_ANY. The full list of Unicode newline characters is taken from
|
||||
http://unicode.org/unicode/reports/tr18/. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Check for newline at given position *
|
||||
*************************************************/
|
||||
|
||||
/* This function is called only via the IS_NEWLINE macro, which does so only
|
||||
when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed
|
||||
newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the code unit
|
||||
pointed to by ptr is less than the end of the string.
|
||||
|
||||
Arguments:
|
||||
ptr pointer to possible newline
|
||||
type the newline type
|
||||
endptr pointer to the end of the string
|
||||
lenptr where to return the length
|
||||
utf TRUE if in utf mode
|
||||
|
||||
Returns: TRUE or FALSE
|
||||
*/
|
||||
|
||||
BOOL
|
||||
PRIV(is_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR endptr,
|
||||
uint32_t *lenptr, BOOL utf)
|
||||
{
|
||||
uint32_t c;
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf) { GETCHAR(c, ptr); } else c = *ptr;
|
||||
#else
|
||||
(void)utf;
|
||||
c = *ptr;
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
if (type == NLTYPE_ANYCRLF) switch(c)
|
||||
{
|
||||
case CHAR_LF:
|
||||
*lenptr = 1;
|
||||
return TRUE;
|
||||
|
||||
case CHAR_CR:
|
||||
*lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1;
|
||||
return TRUE;
|
||||
|
||||
default:
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* NLTYPE_ANY */
|
||||
|
||||
else switch(c)
|
||||
{
|
||||
#ifdef EBCDIC
|
||||
case CHAR_NEL:
|
||||
#endif
|
||||
case CHAR_LF:
|
||||
case CHAR_VT:
|
||||
case CHAR_FF:
|
||||
*lenptr = 1;
|
||||
return TRUE;
|
||||
|
||||
case CHAR_CR:
|
||||
*lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1;
|
||||
return TRUE;
|
||||
|
||||
#ifndef EBCDIC
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
case CHAR_NEL:
|
||||
*lenptr = utf? 2 : 1;
|
||||
return TRUE;
|
||||
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: /* PS */
|
||||
*lenptr = 3;
|
||||
return TRUE;
|
||||
|
||||
#else /* 16-bit or 32-bit code units */
|
||||
case CHAR_NEL:
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: /* PS */
|
||||
*lenptr = 1;
|
||||
return TRUE;
|
||||
#endif
|
||||
#endif /* Not EBCDIC */
|
||||
|
||||
default:
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Check for newline at previous position *
|
||||
*************************************************/
|
||||
|
||||
/* This function is called only via the WAS_NEWLINE macro, which does so only
|
||||
when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed
|
||||
newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the initial
|
||||
value of ptr is greater than the start of the string that is being processed.
|
||||
|
||||
Arguments:
|
||||
ptr pointer to possible newline
|
||||
type the newline type
|
||||
startptr pointer to the start of the string
|
||||
lenptr where to return the length
|
||||
utf TRUE if in utf mode
|
||||
|
||||
Returns: TRUE or FALSE
|
||||
*/
|
||||
|
||||
BOOL
|
||||
PRIV(was_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR startptr,
|
||||
uint32_t *lenptr, BOOL utf)
|
||||
{
|
||||
uint32_t c;
|
||||
ptr--;
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf)
|
||||
{
|
||||
BACKCHAR(ptr);
|
||||
GETCHAR(c, ptr);
|
||||
}
|
||||
else c = *ptr;
|
||||
#else
|
||||
(void)utf;
|
||||
c = *ptr;
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
if (type == NLTYPE_ANYCRLF) switch(c)
|
||||
{
|
||||
case CHAR_LF:
|
||||
*lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1;
|
||||
return TRUE;
|
||||
|
||||
case CHAR_CR:
|
||||
*lenptr = 1;
|
||||
return TRUE;
|
||||
|
||||
default:
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* NLTYPE_ANY */
|
||||
|
||||
else switch(c)
|
||||
{
|
||||
case CHAR_LF:
|
||||
*lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1;
|
||||
return TRUE;
|
||||
|
||||
#ifdef EBCDIC
|
||||
case CHAR_NEL:
|
||||
#endif
|
||||
case CHAR_VT:
|
||||
case CHAR_FF:
|
||||
case CHAR_CR:
|
||||
*lenptr = 1;
|
||||
return TRUE;
|
||||
|
||||
#ifndef EBCDIC
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
case CHAR_NEL:
|
||||
*lenptr = utf? 2 : 1;
|
||||
return TRUE;
|
||||
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: /* PS */
|
||||
*lenptr = 3;
|
||||
return TRUE;
|
||||
|
||||
#else /* 16-bit or 32-bit code units */
|
||||
case CHAR_NEL:
|
||||
case 0x2028: /* LS */
|
||||
case 0x2029: /* PS */
|
||||
*lenptr = 1;
|
||||
return TRUE;
|
||||
#endif
|
||||
#endif /* Not EBCDIC */
|
||||
|
||||
default:
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
/* End of pcre2_newline.c */
|
120
third_party/pcre/pcre2_ord2utf.c
vendored
Normal file
120
third_party/pcre/pcre2_ord2utf.c
vendored
Normal file
|
@ -0,0 +1,120 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This file contains a function that converts a Unicode character code point
|
||||
into a UTF string. The behaviour is different for each code unit width. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
/* If SUPPORT_UNICODE is not defined, this function will never be called.
|
||||
Supply a dummy function because some compilers do not like empty source
|
||||
modules. */
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
unsigned int
|
||||
PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer)
|
||||
{
|
||||
(void)(cvalue);
|
||||
(void)(buffer);
|
||||
return 0;
|
||||
}
|
||||
#else /* SUPPORT_UNICODE */
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Convert code point to UTF *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
cvalue the character value
|
||||
buffer pointer to buffer for result
|
||||
|
||||
Returns: number of code units placed in the buffer
|
||||
*/
|
||||
|
||||
unsigned int
|
||||
PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer)
|
||||
{
|
||||
/* Convert to UTF-8 */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
int i, j;
|
||||
for (i = 0; i < PRIV(utf8_table1_size); i++)
|
||||
if ((int)cvalue <= PRIV(utf8_table1)[i]) break;
|
||||
buffer += i;
|
||||
for (j = i; j > 0; j--)
|
||||
{
|
||||
*buffer-- = 0x80 | (cvalue & 0x3f);
|
||||
cvalue >>= 6;
|
||||
}
|
||||
*buffer = PRIV(utf8_table2)[i] | cvalue;
|
||||
return i + 1;
|
||||
|
||||
/* Convert to UTF-16 */
|
||||
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
if (cvalue <= 0xffff)
|
||||
{
|
||||
*buffer = (PCRE2_UCHAR)cvalue;
|
||||
return 1;
|
||||
}
|
||||
cvalue -= 0x10000;
|
||||
*buffer++ = 0xd800 | (cvalue >> 10);
|
||||
*buffer = 0xdc00 | (cvalue & 0x3ff);
|
||||
return 2;
|
||||
|
||||
/* Convert to UTF-32 */
|
||||
|
||||
#else
|
||||
*buffer = (PCRE2_UCHAR)cvalue;
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* End of pcre_ord2utf.c */
|
432
third_party/pcre/pcre2_pattern_info.c
vendored
Normal file
432
third_party/pcre/pcre2_pattern_info.c
vendored
Normal file
|
@ -0,0 +1,432 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Return info about compiled pattern *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
code points to compiled code
|
||||
what what information is required
|
||||
where where to put the information; if NULL, return length
|
||||
|
||||
Returns: 0 when data returned
|
||||
> 0 when length requested
|
||||
< 0 on error or unset value
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where)
|
||||
{
|
||||
const pcre2_real_code *re = (pcre2_real_code *)code;
|
||||
|
||||
if (where == NULL) /* Requests field length */
|
||||
{
|
||||
switch(what)
|
||||
{
|
||||
case PCRE2_INFO_ALLOPTIONS:
|
||||
case PCRE2_INFO_ARGOPTIONS:
|
||||
case PCRE2_INFO_BACKREFMAX:
|
||||
case PCRE2_INFO_BSR:
|
||||
case PCRE2_INFO_CAPTURECOUNT:
|
||||
case PCRE2_INFO_DEPTHLIMIT:
|
||||
case PCRE2_INFO_EXTRAOPTIONS:
|
||||
case PCRE2_INFO_FIRSTCODETYPE:
|
||||
case PCRE2_INFO_FIRSTCODEUNIT:
|
||||
case PCRE2_INFO_HASBACKSLASHC:
|
||||
case PCRE2_INFO_HASCRORLF:
|
||||
case PCRE2_INFO_HEAPLIMIT:
|
||||
case PCRE2_INFO_JCHANGED:
|
||||
case PCRE2_INFO_LASTCODETYPE:
|
||||
case PCRE2_INFO_LASTCODEUNIT:
|
||||
case PCRE2_INFO_MATCHEMPTY:
|
||||
case PCRE2_INFO_MATCHLIMIT:
|
||||
case PCRE2_INFO_MAXLOOKBEHIND:
|
||||
case PCRE2_INFO_MINLENGTH:
|
||||
case PCRE2_INFO_NAMEENTRYSIZE:
|
||||
case PCRE2_INFO_NAMECOUNT:
|
||||
case PCRE2_INFO_NEWLINE:
|
||||
return sizeof(uint32_t);
|
||||
|
||||
case PCRE2_INFO_FIRSTBITMAP:
|
||||
return sizeof(const uint8_t *);
|
||||
|
||||
case PCRE2_INFO_JITSIZE:
|
||||
case PCRE2_INFO_SIZE:
|
||||
case PCRE2_INFO_FRAMESIZE:
|
||||
return sizeof(size_t);
|
||||
|
||||
case PCRE2_INFO_NAMETABLE:
|
||||
return sizeof(PCRE2_SPTR);
|
||||
}
|
||||
}
|
||||
|
||||
if (re == NULL) return PCRE2_ERROR_NULL;
|
||||
|
||||
/* Check that the first field in the block is the magic number. If it is not,
|
||||
return with PCRE2_ERROR_BADMAGIC. */
|
||||
|
||||
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
|
||||
|
||||
/* Check that this pattern was compiled in the correct bit mode */
|
||||
|
||||
if ((re->flags & (PCRE2_CODE_UNIT_WIDTH/8)) == 0) return PCRE2_ERROR_BADMODE;
|
||||
|
||||
switch(what)
|
||||
{
|
||||
case PCRE2_INFO_ALLOPTIONS:
|
||||
*((uint32_t *)where) = re->overall_options;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_ARGOPTIONS:
|
||||
*((uint32_t *)where) = re->compile_options;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_BACKREFMAX:
|
||||
*((uint32_t *)where) = re->top_backref;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_BSR:
|
||||
*((uint32_t *)where) = re->bsr_convention;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_CAPTURECOUNT:
|
||||
*((uint32_t *)where) = re->top_bracket;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_DEPTHLIMIT:
|
||||
*((uint32_t *)where) = re->limit_depth;
|
||||
if (re->limit_depth == UINT32_MAX) return PCRE2_ERROR_UNSET;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_EXTRAOPTIONS:
|
||||
*((uint32_t *)where) = re->extra_options;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_FIRSTCODETYPE:
|
||||
*((uint32_t *)where) = ((re->flags & PCRE2_FIRSTSET) != 0)? 1 :
|
||||
((re->flags & PCRE2_STARTLINE) != 0)? 2 : 0;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_FIRSTCODEUNIT:
|
||||
*((uint32_t *)where) = ((re->flags & PCRE2_FIRSTSET) != 0)?
|
||||
re->first_codeunit : 0;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_FIRSTBITMAP:
|
||||
*((const uint8_t **)where) = ((re->flags & PCRE2_FIRSTMAPSET) != 0)?
|
||||
&(re->start_bitmap[0]) : NULL;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_FRAMESIZE:
|
||||
*((size_t *)where) = offsetof(heapframe, ovector) +
|
||||
re->top_bracket * 2 * sizeof(PCRE2_SIZE);
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_HASBACKSLASHC:
|
||||
*((uint32_t *)where) = (re->flags & PCRE2_HASBKC) != 0;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_HASCRORLF:
|
||||
*((uint32_t *)where) = (re->flags & PCRE2_HASCRORLF) != 0;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_HEAPLIMIT:
|
||||
*((uint32_t *)where) = re->limit_heap;
|
||||
if (re->limit_heap == UINT32_MAX) return PCRE2_ERROR_UNSET;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_JCHANGED:
|
||||
*((uint32_t *)where) = (re->flags & PCRE2_JCHANGED) != 0;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_JITSIZE:
|
||||
#ifdef SUPPORT_JIT
|
||||
*((size_t *)where) = (re->executable_jit != NULL)?
|
||||
PRIV(jit_get_size)(re->executable_jit) : 0;
|
||||
#else
|
||||
*((size_t *)where) = 0;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_LASTCODETYPE:
|
||||
*((uint32_t *)where) = ((re->flags & PCRE2_LASTSET) != 0)? 1 : 0;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_LASTCODEUNIT:
|
||||
*((uint32_t *)where) = ((re->flags & PCRE2_LASTSET) != 0)?
|
||||
re->last_codeunit : 0;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_MATCHEMPTY:
|
||||
*((uint32_t *)where) = (re->flags & PCRE2_MATCH_EMPTY) != 0;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_MATCHLIMIT:
|
||||
*((uint32_t *)where) = re->limit_match;
|
||||
if (re->limit_match == UINT32_MAX) return PCRE2_ERROR_UNSET;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_MAXLOOKBEHIND:
|
||||
*((uint32_t *)where) = re->max_lookbehind;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_MINLENGTH:
|
||||
*((uint32_t *)where) = re->minlength;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_NAMEENTRYSIZE:
|
||||
*((uint32_t *)where) = re->name_entry_size;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_NAMECOUNT:
|
||||
*((uint32_t *)where) = re->name_count;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_NAMETABLE:
|
||||
*((PCRE2_SPTR *)where) = (PCRE2_SPTR)((char *)re + sizeof(pcre2_real_code));
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_NEWLINE:
|
||||
*((uint32_t *)where) = re->newline_convention;
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_SIZE:
|
||||
*((size_t *)where) = re->blocksize;
|
||||
break;
|
||||
|
||||
default: return PCRE2_ERROR_BADOPTION;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Callout enumerator *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
code points to compiled code
|
||||
callback function called for each callout block
|
||||
callout_data user data passed to the callback
|
||||
|
||||
Returns: 0 when successfully completed
|
||||
< 0 on local error
|
||||
!= 0 for callback error
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_callout_enumerate(const pcre2_code *code,
|
||||
int (*callback)(pcre2_callout_enumerate_block *, void *), void *callout_data)
|
||||
{
|
||||
pcre2_real_code *re = (pcre2_real_code *)code;
|
||||
pcre2_callout_enumerate_block cb;
|
||||
PCRE2_SPTR cc;
|
||||
#ifdef SUPPORT_UNICODE
|
||||
BOOL utf;
|
||||
#endif
|
||||
|
||||
if (re == NULL) return PCRE2_ERROR_NULL;
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
utf = (re->overall_options & PCRE2_UTF) != 0;
|
||||
#endif
|
||||
|
||||
/* Check that the first field in the block is the magic number. If it is not,
|
||||
return with PCRE2_ERROR_BADMAGIC. */
|
||||
|
||||
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
|
||||
|
||||
/* Check that this pattern was compiled in the correct bit mode */
|
||||
|
||||
if ((re->flags & (PCRE2_CODE_UNIT_WIDTH/8)) == 0) return PCRE2_ERROR_BADMODE;
|
||||
|
||||
cb.version = 0;
|
||||
cc = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code))
|
||||
+ re->name_count * re->name_entry_size;
|
||||
|
||||
while (TRUE)
|
||||
{
|
||||
int rc;
|
||||
switch (*cc)
|
||||
{
|
||||
case OP_END:
|
||||
return 0;
|
||||
|
||||
case OP_CHAR:
|
||||
case OP_CHARI:
|
||||
case OP_NOT:
|
||||
case OP_NOTI:
|
||||
case OP_STAR:
|
||||
case OP_MINSTAR:
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_EXACT:
|
||||
case OP_POSSTAR:
|
||||
case OP_POSPLUS:
|
||||
case OP_POSQUERY:
|
||||
case OP_POSUPTO:
|
||||
case OP_STARI:
|
||||
case OP_MINSTARI:
|
||||
case OP_PLUSI:
|
||||
case OP_MINPLUSI:
|
||||
case OP_QUERYI:
|
||||
case OP_MINQUERYI:
|
||||
case OP_UPTOI:
|
||||
case OP_MINUPTOI:
|
||||
case OP_EXACTI:
|
||||
case OP_POSSTARI:
|
||||
case OP_POSPLUSI:
|
||||
case OP_POSQUERYI:
|
||||
case OP_POSUPTOI:
|
||||
case OP_NOTSTAR:
|
||||
case OP_NOTMINSTAR:
|
||||
case OP_NOTPLUS:
|
||||
case OP_NOTMINPLUS:
|
||||
case OP_NOTQUERY:
|
||||
case OP_NOTMINQUERY:
|
||||
case OP_NOTUPTO:
|
||||
case OP_NOTMINUPTO:
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTPOSSTAR:
|
||||
case OP_NOTPOSPLUS:
|
||||
case OP_NOTPOSQUERY:
|
||||
case OP_NOTPOSUPTO:
|
||||
case OP_NOTSTARI:
|
||||
case OP_NOTMINSTARI:
|
||||
case OP_NOTPLUSI:
|
||||
case OP_NOTMINPLUSI:
|
||||
case OP_NOTQUERYI:
|
||||
case OP_NOTMINQUERYI:
|
||||
case OP_NOTUPTOI:
|
||||
case OP_NOTMINUPTOI:
|
||||
case OP_NOTEXACTI:
|
||||
case OP_NOTPOSSTARI:
|
||||
case OP_NOTPOSPLUSI:
|
||||
case OP_NOTPOSQUERYI:
|
||||
case OP_NOTPOSUPTOI:
|
||||
cc += PRIV(OP_lengths)[*cc];
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
|
||||
#endif
|
||||
break;
|
||||
|
||||
case OP_TYPESTAR:
|
||||
case OP_TYPEMINSTAR:
|
||||
case OP_TYPEPLUS:
|
||||
case OP_TYPEMINPLUS:
|
||||
case OP_TYPEQUERY:
|
||||
case OP_TYPEMINQUERY:
|
||||
case OP_TYPEUPTO:
|
||||
case OP_TYPEMINUPTO:
|
||||
case OP_TYPEEXACT:
|
||||
case OP_TYPEPOSSTAR:
|
||||
case OP_TYPEPOSPLUS:
|
||||
case OP_TYPEPOSQUERY:
|
||||
case OP_TYPEPOSUPTO:
|
||||
cc += PRIV(OP_lengths)[*cc];
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (cc[-1] == OP_PROP || cc[-1] == OP_NOTPROP) cc += 2;
|
||||
#endif
|
||||
break;
|
||||
|
||||
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
|
||||
case OP_XCLASS:
|
||||
cc += GET(cc, 1);
|
||||
break;
|
||||
#endif
|
||||
|
||||
case OP_MARK:
|
||||
case OP_COMMIT_ARG:
|
||||
case OP_PRUNE_ARG:
|
||||
case OP_SKIP_ARG:
|
||||
case OP_THEN_ARG:
|
||||
cc += PRIV(OP_lengths)[*cc] + cc[1];
|
||||
break;
|
||||
|
||||
case OP_CALLOUT:
|
||||
cb.pattern_position = GET(cc, 1);
|
||||
cb.next_item_length = GET(cc, 1 + LINK_SIZE);
|
||||
cb.callout_number = cc[1 + 2*LINK_SIZE];
|
||||
cb.callout_string_offset = 0;
|
||||
cb.callout_string_length = 0;
|
||||
cb.callout_string = NULL;
|
||||
rc = callback(&cb, callout_data);
|
||||
if (rc != 0) return rc;
|
||||
cc += PRIV(OP_lengths)[*cc];
|
||||
break;
|
||||
|
||||
case OP_CALLOUT_STR:
|
||||
cb.pattern_position = GET(cc, 1);
|
||||
cb.next_item_length = GET(cc, 1 + LINK_SIZE);
|
||||
cb.callout_number = 0;
|
||||
cb.callout_string_offset = GET(cc, 1 + 3*LINK_SIZE);
|
||||
cb.callout_string_length =
|
||||
GET(cc, 1 + 2*LINK_SIZE) - (1 + 4*LINK_SIZE) - 2;
|
||||
cb.callout_string = cc + (1 + 4*LINK_SIZE) + 1;
|
||||
rc = callback(&cb, callout_data);
|
||||
if (rc != 0) return rc;
|
||||
cc += GET(cc, 1 + 2*LINK_SIZE);
|
||||
break;
|
||||
|
||||
default:
|
||||
cc += PRIV(OP_lengths)[*cc];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of pcre2_pattern_info.c */
|
868
third_party/pcre/pcre2_printint.inc
vendored
Normal file
868
third_party/pcre/pcre2_printint.inc
vendored
Normal file
|
@ -0,0 +1,868 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains a PCRE private debugging function for printing out the
|
||||
internal form of a compiled regular expression, along with some supporting
|
||||
local functions. This source file is #included in pcre2test.c at each supported
|
||||
code unit width, with PCRE2_SUFFIX set appropriately, just like the functions
|
||||
that comprise the library. It can also optionally be included in
|
||||
pcre2_compile.c for detailed debugging in error situations. */
|
||||
|
||||
|
||||
/* Tables of operator names. The same 8-bit table is used for all code unit
|
||||
widths, so it must be defined only once. The list itself is defined in
|
||||
pcre2_internal.h, which is #included by pcre2test before this file. */
|
||||
|
||||
#ifndef OP_LISTS_DEFINED
|
||||
static const char *OP_names[] = { OP_NAME_LIST };
|
||||
#define OP_LISTS_DEFINED
|
||||
#endif
|
||||
|
||||
/* The functions and tables herein must all have mode-dependent names. */
|
||||
|
||||
#define OP_lengths PCRE2_SUFFIX(OP_lengths_)
|
||||
#define get_ucpname PCRE2_SUFFIX(get_ucpname_)
|
||||
#define pcre2_printint PCRE2_SUFFIX(pcre2_printint_)
|
||||
#define print_char PCRE2_SUFFIX(print_char_)
|
||||
#define print_custring PCRE2_SUFFIX(print_custring_)
|
||||
#define print_custring_bylen PCRE2_SUFFIX(print_custring_bylen_)
|
||||
#define print_prop PCRE2_SUFFIX(print_prop_)
|
||||
|
||||
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
|
||||
the definition is next to the definition of the opcodes in pcre2_internal.h.
|
||||
The contents of the table are, however, mode-dependent. */
|
||||
|
||||
static const uint8_t OP_lengths[] = { OP_LENGTHS };
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print one character from a string *
|
||||
*************************************************/
|
||||
|
||||
/* In UTF mode the character may occupy more than one code unit.
|
||||
|
||||
Arguments:
|
||||
f file to write to
|
||||
ptr pointer to first code unit of the character
|
||||
utf TRUE if string is UTF (will be FALSE if UTF is not supported)
|
||||
|
||||
Returns: number of additional code units used
|
||||
*/
|
||||
|
||||
static unsigned int
|
||||
print_char(FILE *f, PCRE2_SPTR ptr, BOOL utf)
|
||||
{
|
||||
uint32_t c = *ptr;
|
||||
BOOL one_code_unit = !utf;
|
||||
|
||||
/* If UTF is supported and requested, check for a valid single code unit. */
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf)
|
||||
{
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
one_code_unit = c < 0x80;
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
one_code_unit = (c & 0xfc00) != 0xd800;
|
||||
#else
|
||||
one_code_unit = (c & 0xfffff800u) != 0xd800u;
|
||||
#endif /* CODE_UNIT_WIDTH */
|
||||
}
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* Handle a valid one-code-unit character at any width. */
|
||||
|
||||
if (one_code_unit)
|
||||
{
|
||||
if (PRINTABLE(c)) fprintf(f, "%c", (char)c);
|
||||
else if (c < 0x80) fprintf(f, "\\x%02x", c);
|
||||
else fprintf(f, "\\x{%02x}", c);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Code for invalid UTF code units and multi-unit UTF characters is different
|
||||
for each width. If UTF is not supported, control should never get here, but we
|
||||
need a return statement to keep the compiler happy. */
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
return 0;
|
||||
#else
|
||||
|
||||
/* Malformed UTF-8 should occur only if the sanity check has been turned off.
|
||||
Rather than swallow random bytes, just stop if we hit a bad one. Print it with
|
||||
\X instead of \x as an indication. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
if ((c & 0xc0) != 0xc0)
|
||||
{
|
||||
fprintf(f, "\\X{%x}", c); /* Invalid starting byte */
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
int i;
|
||||
int a = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */
|
||||
int s = 6*a;
|
||||
c = (c & PRIV(utf8_table3)[a]) << s;
|
||||
for (i = 1; i <= a; i++)
|
||||
{
|
||||
if ((ptr[i] & 0xc0) != 0x80)
|
||||
{
|
||||
fprintf(f, "\\X{%x}", c); /* Invalid secondary byte */
|
||||
return i - 1;
|
||||
}
|
||||
s -= 6;
|
||||
c |= (ptr[i] & 0x3f) << s;
|
||||
}
|
||||
fprintf(f, "\\x{%x}", c);
|
||||
return a;
|
||||
}
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
||||
|
||||
/* UTF-16: rather than swallow a low surrogate, just stop if we hit a bad one.
|
||||
Print it with \X instead of \x as an indication. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 16
|
||||
if ((ptr[1] & 0xfc00) != 0xdc00)
|
||||
{
|
||||
fprintf(f, "\\X{%x}", c);
|
||||
return 0;
|
||||
}
|
||||
c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000;
|
||||
fprintf(f, "\\x{%x}", c);
|
||||
return 1;
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == 16 */
|
||||
|
||||
/* For UTF-32 we get here only for a malformed code unit, which should only
|
||||
occur if the sanity check has been turned off. Print it with \X instead of \x
|
||||
as an indication. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
fprintf(f, "\\X{%x}", c);
|
||||
return 0;
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print string as a list of code units *
|
||||
*************************************************/
|
||||
|
||||
/* These take no account of UTF as they always print each individual code unit.
|
||||
The string is zero-terminated for print_custring(); the length is given for
|
||||
print_custring_bylen().
|
||||
|
||||
Arguments:
|
||||
f file to write to
|
||||
ptr point to the string
|
||||
len length for print_custring_bylen()
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
print_custring(FILE *f, PCRE2_SPTR ptr)
|
||||
{
|
||||
while (*ptr != '\0')
|
||||
{
|
||||
uint32_t c = *ptr++;
|
||||
if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x{%x}", c);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
print_custring_bylen(FILE *f, PCRE2_SPTR ptr, PCRE2_UCHAR len)
|
||||
{
|
||||
for (; len > 0; len--)
|
||||
{
|
||||
uint32_t c = *ptr++;
|
||||
if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x{%x}", c);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find Unicode property name *
|
||||
*************************************************/
|
||||
|
||||
/* When there is no UTF/UCP support, the table of names does not exist. This
|
||||
function should not be called in such configurations, because a pattern that
|
||||
tries to use Unicode properties won't compile. Rather than put lots of #ifdefs
|
||||
into the main code, however, we just put one into this function.
|
||||
|
||||
Now that the table contains both full names and their abbreviations, we do some
|
||||
fiddling to try to get the full name, which is either the longer of two found
|
||||
names, or a 3-character script name. */
|
||||
|
||||
static const char *
|
||||
get_ucpname(unsigned int ptype, unsigned int pvalue)
|
||||
{
|
||||
#ifdef SUPPORT_UNICODE
|
||||
int count = 0;
|
||||
const char *yield = "??";
|
||||
size_t len = 0;
|
||||
unsigned int ptypex = (ptype == PT_SC)? PT_SCX : ptype;
|
||||
|
||||
for (int i = PRIV(utt_size) - 1; i >= 0; i--)
|
||||
{
|
||||
const ucp_type_table *u = PRIV(utt) + i;
|
||||
|
||||
if ((ptype == u->type || ptypex == u->type) && pvalue == u->value)
|
||||
{
|
||||
const char *s = PRIV(utt_names) + u->name_offset;
|
||||
size_t sl = strlen(s);
|
||||
|
||||
if (sl == 3 && (u->type == PT_SC || u->type == PT_SCX))
|
||||
{
|
||||
yield = s;
|
||||
break;
|
||||
}
|
||||
|
||||
if (sl > len)
|
||||
{
|
||||
yield = s;
|
||||
len = sl;
|
||||
}
|
||||
|
||||
if (++count >= 2) break;
|
||||
}
|
||||
}
|
||||
|
||||
return yield;
|
||||
|
||||
#else /* No UTF support */
|
||||
(void)ptype;
|
||||
(void)pvalue;
|
||||
return "??";
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print Unicode property value *
|
||||
*************************************************/
|
||||
|
||||
/* "Normal" properties can be printed from tables. The PT_CLIST property is a
|
||||
pseudo-property that contains a pointer to a list of case-equivalent
|
||||
characters.
|
||||
|
||||
Arguments:
|
||||
f file to write to
|
||||
code pointer in the compiled code
|
||||
before text to print before
|
||||
after text to print after
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
print_prop(FILE *f, PCRE2_SPTR code, const char *before, const char *after)
|
||||
{
|
||||
if (code[1] != PT_CLIST)
|
||||
{
|
||||
const char *sc = (code[1] == PT_SC)? "script:" : "";
|
||||
const char *s = get_ucpname(code[1], code[2]);
|
||||
fprintf(f, "%s%s %s%c%s%s", before, OP_names[*code], sc, toupper(s[0]), s+1, after);
|
||||
}
|
||||
else
|
||||
{
|
||||
const char *not = (*code == OP_PROP)? "" : "not ";
|
||||
const uint32_t *p = PRIV(ucd_caseless_sets) + code[2];
|
||||
fprintf (f, "%s%sclist", before, not);
|
||||
while (*p < NOTACHAR) fprintf(f, " %04x", *p++);
|
||||
fprintf(f, "%s", after);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print compiled pattern *
|
||||
*************************************************/
|
||||
|
||||
/* The print_lengths flag controls whether offsets and lengths of items are
|
||||
printed. Lenths can be turned off from pcre2test so that automatic tests on
|
||||
bytecode can be written that do not depend on the value of LINK_SIZE.
|
||||
|
||||
Arguments:
|
||||
re a compiled pattern
|
||||
f the file to write to
|
||||
print_lengths show various lengths
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
pcre2_printint(pcre2_code *re, FILE *f, BOOL print_lengths)
|
||||
{
|
||||
PCRE2_SPTR codestart, nametable, code;
|
||||
uint32_t nesize = re->name_entry_size;
|
||||
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
|
||||
|
||||
nametable = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code));
|
||||
code = codestart = nametable + re->name_count * re->name_entry_size;
|
||||
|
||||
for(;;)
|
||||
{
|
||||
PCRE2_SPTR ccode;
|
||||
uint32_t c;
|
||||
int i;
|
||||
const char *flag = " ";
|
||||
unsigned int extra = 0;
|
||||
|
||||
if (print_lengths)
|
||||
fprintf(f, "%3d ", (int)(code - codestart));
|
||||
else
|
||||
fprintf(f, " ");
|
||||
|
||||
switch(*code)
|
||||
{
|
||||
/* ========================================================================== */
|
||||
/* These cases are never obeyed. This is a fudge that causes a compile-
|
||||
time error if the vectors OP_names or OP_lengths, which are indexed
|
||||
by opcode, are not the correct length. It seems to be the only way to do
|
||||
such a check at compile time, as the sizeof() operator does not work in
|
||||
the C preprocessor. */
|
||||
|
||||
case OP_TABLE_LENGTH:
|
||||
case OP_TABLE_LENGTH +
|
||||
((sizeof(OP_names)/sizeof(const char *) == OP_TABLE_LENGTH) &&
|
||||
(sizeof(OP_lengths) == OP_TABLE_LENGTH)):
|
||||
return;
|
||||
/* ========================================================================== */
|
||||
|
||||
case OP_END:
|
||||
fprintf(f, " %s\n", OP_names[*code]);
|
||||
fprintf(f, "------------------------------------------------------------------\n");
|
||||
return;
|
||||
|
||||
case OP_CHAR:
|
||||
fprintf(f, " ");
|
||||
do
|
||||
{
|
||||
code++;
|
||||
code += 1 + print_char(f, code, utf);
|
||||
}
|
||||
while (*code == OP_CHAR);
|
||||
fprintf(f, "\n");
|
||||
continue;
|
||||
|
||||
case OP_CHARI:
|
||||
fprintf(f, " /i ");
|
||||
do
|
||||
{
|
||||
code++;
|
||||
code += 1 + print_char(f, code, utf);
|
||||
}
|
||||
while (*code == OP_CHARI);
|
||||
fprintf(f, "\n");
|
||||
continue;
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_CBRAPOS:
|
||||
case OP_SCBRA:
|
||||
case OP_SCBRAPOS:
|
||||
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
|
||||
else fprintf(f, " ");
|
||||
fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
|
||||
break;
|
||||
|
||||
case OP_BRA:
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRA:
|
||||
case OP_SBRAPOS:
|
||||
case OP_KETRMAX:
|
||||
case OP_KETRMIN:
|
||||
case OP_KETRPOS:
|
||||
case OP_ALT:
|
||||
case OP_KET:
|
||||
case OP_ASSERT:
|
||||
case OP_ASSERT_NOT:
|
||||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
case OP_ASSERT_NA:
|
||||
case OP_ASSERTBACK_NA:
|
||||
case OP_ONCE:
|
||||
case OP_SCRIPT_RUN:
|
||||
case OP_COND:
|
||||
case OP_SCOND:
|
||||
case OP_REVERSE:
|
||||
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
|
||||
else fprintf(f, " ");
|
||||
fprintf(f, "%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_CLOSE:
|
||||
fprintf(f, " %s %d", OP_names[*code], GET2(code, 1));
|
||||
break;
|
||||
|
||||
case OP_CREF:
|
||||
fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_DNCREF:
|
||||
{
|
||||
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
|
||||
fprintf(f, " %s Cond ref <", flag);
|
||||
print_custring(f, entry);
|
||||
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
|
||||
}
|
||||
break;
|
||||
|
||||
case OP_RREF:
|
||||
c = GET2(code, 1);
|
||||
if (c == RREF_ANY)
|
||||
fprintf(f, " Cond recurse any");
|
||||
else
|
||||
fprintf(f, " Cond recurse %d", c);
|
||||
break;
|
||||
|
||||
case OP_DNRREF:
|
||||
{
|
||||
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
|
||||
fprintf(f, " %s Cond recurse <", flag);
|
||||
print_custring(f, entry);
|
||||
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
|
||||
}
|
||||
break;
|
||||
|
||||
case OP_FALSE:
|
||||
fprintf(f, " Cond false");
|
||||
break;
|
||||
|
||||
case OP_TRUE:
|
||||
fprintf(f, " Cond true");
|
||||
break;
|
||||
|
||||
case OP_STARI:
|
||||
case OP_MINSTARI:
|
||||
case OP_POSSTARI:
|
||||
case OP_PLUSI:
|
||||
case OP_MINPLUSI:
|
||||
case OP_POSPLUSI:
|
||||
case OP_QUERYI:
|
||||
case OP_MINQUERYI:
|
||||
case OP_POSQUERYI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_STAR:
|
||||
case OP_MINSTAR:
|
||||
case OP_POSSTAR:
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_POSQUERY:
|
||||
case OP_TYPESTAR:
|
||||
case OP_TYPEMINSTAR:
|
||||
case OP_TYPEPOSSTAR:
|
||||
case OP_TYPEPLUS:
|
||||
case OP_TYPEMINPLUS:
|
||||
case OP_TYPEPOSPLUS:
|
||||
case OP_TYPEQUERY:
|
||||
case OP_TYPEMINQUERY:
|
||||
case OP_TYPEPOSQUERY:
|
||||
fprintf(f, " %s ", flag);
|
||||
|
||||
if (*code >= OP_TYPESTAR)
|
||||
{
|
||||
if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
|
||||
{
|
||||
print_prop(f, code + 1, "", " ");
|
||||
extra = 2;
|
||||
}
|
||||
else fprintf(f, "%s", OP_names[code[1]]);
|
||||
}
|
||||
else extra = print_char(f, code+1, utf);
|
||||
fprintf(f, "%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_EXACTI:
|
||||
case OP_UPTOI:
|
||||
case OP_MINUPTOI:
|
||||
case OP_POSUPTOI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_EXACT:
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_POSUPTO:
|
||||
fprintf(f, " %s ", flag);
|
||||
extra = print_char(f, code + 1 + IMM2_SIZE, utf);
|
||||
fprintf(f, "{");
|
||||
if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_MINUPTO || *code == OP_MINUPTOI) fprintf(f, "?");
|
||||
else if (*code == OP_POSUPTO || *code == OP_POSUPTOI) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_TYPEEXACT:
|
||||
case OP_TYPEUPTO:
|
||||
case OP_TYPEMINUPTO:
|
||||
case OP_TYPEPOSUPTO:
|
||||
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
|
||||
{
|
||||
print_prop(f, code + IMM2_SIZE + 1, " ", " ");
|
||||
extra = 2;
|
||||
}
|
||||
else fprintf(f, " %s", OP_names[code[1 + IMM2_SIZE]]);
|
||||
fprintf(f, "{");
|
||||
if (*code != OP_TYPEEXACT) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
|
||||
else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_NOTI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_NOT:
|
||||
fprintf(f, " %s [^", flag);
|
||||
extra = print_char(f, code + 1, utf);
|
||||
fprintf(f, "]");
|
||||
break;
|
||||
|
||||
case OP_NOTSTARI:
|
||||
case OP_NOTMINSTARI:
|
||||
case OP_NOTPOSSTARI:
|
||||
case OP_NOTPLUSI:
|
||||
case OP_NOTMINPLUSI:
|
||||
case OP_NOTPOSPLUSI:
|
||||
case OP_NOTQUERYI:
|
||||
case OP_NOTMINQUERYI:
|
||||
case OP_NOTPOSQUERYI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
|
||||
case OP_NOTSTAR:
|
||||
case OP_NOTMINSTAR:
|
||||
case OP_NOTPOSSTAR:
|
||||
case OP_NOTPLUS:
|
||||
case OP_NOTMINPLUS:
|
||||
case OP_NOTPOSPLUS:
|
||||
case OP_NOTQUERY:
|
||||
case OP_NOTMINQUERY:
|
||||
case OP_NOTPOSQUERY:
|
||||
fprintf(f, " %s [^", flag);
|
||||
extra = print_char(f, code + 1, utf);
|
||||
fprintf(f, "]%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_NOTEXACTI:
|
||||
case OP_NOTUPTOI:
|
||||
case OP_NOTMINUPTOI:
|
||||
case OP_NOTPOSUPTOI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTUPTO:
|
||||
case OP_NOTMINUPTO:
|
||||
case OP_NOTPOSUPTO:
|
||||
fprintf(f, " %s [^", flag);
|
||||
extra = print_char(f, code + 1 + IMM2_SIZE, utf);
|
||||
fprintf(f, "]{");
|
||||
if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?");
|
||||
else
|
||||
if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_RECURSE:
|
||||
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
|
||||
else fprintf(f, " ");
|
||||
fprintf(f, "%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_REFI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_REF:
|
||||
fprintf(f, " %s \\%d", flag, GET2(code,1));
|
||||
ccode = code + OP_lengths[*code];
|
||||
goto CLASS_REF_REPEAT;
|
||||
|
||||
case OP_DNREFI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_DNREF:
|
||||
{
|
||||
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
|
||||
fprintf(f, " %s \\k<", flag);
|
||||
print_custring(f, entry);
|
||||
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
|
||||
}
|
||||
ccode = code + OP_lengths[*code];
|
||||
goto CLASS_REF_REPEAT;
|
||||
|
||||
case OP_CALLOUT:
|
||||
fprintf(f, " %s %d %d %d", OP_names[*code], code[1 + 2*LINK_SIZE],
|
||||
GET(code, 1), GET(code, 1 + LINK_SIZE));
|
||||
break;
|
||||
|
||||
case OP_CALLOUT_STR:
|
||||
c = code[1 + 4*LINK_SIZE];
|
||||
fprintf(f, " %s %c", OP_names[*code], c);
|
||||
extra = GET(code, 1 + 2*LINK_SIZE);
|
||||
print_custring_bylen(f, code + 2 + 4*LINK_SIZE, extra - 3 - 4*LINK_SIZE);
|
||||
for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
|
||||
if (c == PRIV(callout_start_delims)[i])
|
||||
{
|
||||
c = PRIV(callout_end_delims)[i];
|
||||
break;
|
||||
}
|
||||
fprintf(f, "%c %d %d %d", c, GET(code, 1 + 3*LINK_SIZE), GET(code, 1),
|
||||
GET(code, 1 + LINK_SIZE));
|
||||
break;
|
||||
|
||||
case OP_PROP:
|
||||
case OP_NOTPROP:
|
||||
print_prop(f, code, " ", "");
|
||||
break;
|
||||
|
||||
/* OP_XCLASS cannot occur in 8-bit, non-UTF mode. However, there's no harm
|
||||
in having this code always here, and it makes it less messy without all
|
||||
those #ifdefs. */
|
||||
|
||||
case OP_CLASS:
|
||||
case OP_NCLASS:
|
||||
case OP_XCLASS:
|
||||
{
|
||||
unsigned int min, max;
|
||||
BOOL printmap;
|
||||
BOOL invertmap = FALSE;
|
||||
uint8_t *map;
|
||||
uint8_t inverted_map[32];
|
||||
|
||||
fprintf(f, " [");
|
||||
|
||||
if (*code == OP_XCLASS)
|
||||
{
|
||||
extra = GET(code, 1);
|
||||
ccode = code + LINK_SIZE + 1;
|
||||
printmap = (*ccode & XCL_MAP) != 0;
|
||||
if ((*ccode & XCL_NOT) != 0)
|
||||
{
|
||||
invertmap = (*ccode & XCL_HASPROP) == 0;
|
||||
fprintf(f, "^");
|
||||
}
|
||||
ccode++;
|
||||
}
|
||||
else
|
||||
{
|
||||
printmap = TRUE;
|
||||
ccode = code + 1;
|
||||
}
|
||||
|
||||
/* Print a bit map */
|
||||
|
||||
if (printmap)
|
||||
{
|
||||
map = (uint8_t *)ccode;
|
||||
if (invertmap)
|
||||
{
|
||||
/* Using 255 ^ instead of ~ avoids clang sanitize warning. */
|
||||
for (i = 0; i < 32; i++) inverted_map[i] = 255 ^ map[i];
|
||||
map = inverted_map;
|
||||
}
|
||||
|
||||
for (i = 0; i < 256; i++)
|
||||
{
|
||||
if ((map[i/8] & (1u << (i&7))) != 0)
|
||||
{
|
||||
int j;
|
||||
for (j = i+1; j < 256; j++)
|
||||
if ((map[j/8] & (1u << (j&7))) == 0) break;
|
||||
if (i == '-' || i == ']') fprintf(f, "\\");
|
||||
if (PRINTABLE(i)) fprintf(f, "%c", i);
|
||||
else fprintf(f, "\\x%02x", i);
|
||||
if (--j > i)
|
||||
{
|
||||
if (j != i + 1) fprintf(f, "-");
|
||||
if (j == '-' || j == ']') fprintf(f, "\\");
|
||||
if (PRINTABLE(j)) fprintf(f, "%c", j);
|
||||
else fprintf(f, "\\x%02x", j);
|
||||
}
|
||||
i = j;
|
||||
}
|
||||
}
|
||||
ccode += 32 / sizeof(PCRE2_UCHAR);
|
||||
}
|
||||
|
||||
/* For an XCLASS there is always some additional data */
|
||||
|
||||
if (*code == OP_XCLASS)
|
||||
{
|
||||
PCRE2_UCHAR ch;
|
||||
while ((ch = *ccode++) != XCL_END)
|
||||
{
|
||||
BOOL not = FALSE;
|
||||
const char *notch = "";
|
||||
|
||||
switch(ch)
|
||||
{
|
||||
case XCL_NOTPROP:
|
||||
not = TRUE;
|
||||
notch = "^";
|
||||
/* Fall through */
|
||||
|
||||
case XCL_PROP:
|
||||
{
|
||||
unsigned int ptype = *ccode++;
|
||||
unsigned int pvalue = *ccode++;
|
||||
const char *s;
|
||||
|
||||
switch(ptype)
|
||||
{
|
||||
case PT_PXGRAPH:
|
||||
fprintf(f, "[:%sgraph:]", notch);
|
||||
break;
|
||||
|
||||
case PT_PXPRINT:
|
||||
fprintf(f, "[:%sprint:]", notch);
|
||||
break;
|
||||
|
||||
case PT_PXPUNCT:
|
||||
fprintf(f, "[:%spunct:]", notch);
|
||||
break;
|
||||
|
||||
default:
|
||||
s = get_ucpname(ptype, pvalue);
|
||||
fprintf(f, "\\%c{%c%s}", (not? 'P':'p'), toupper(s[0]), s+1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
ccode += 1 + print_char(f, ccode, utf);
|
||||
if (ch == XCL_RANGE)
|
||||
{
|
||||
fprintf(f, "-");
|
||||
ccode += 1 + print_char(f, ccode, utf);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Indicate a non-UTF class which was created by negation */
|
||||
|
||||
fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
|
||||
|
||||
/* Handle repeats after a class or a back reference */
|
||||
|
||||
CLASS_REF_REPEAT:
|
||||
switch(*ccode)
|
||||
{
|
||||
case OP_CRSTAR:
|
||||
case OP_CRMINSTAR:
|
||||
case OP_CRPLUS:
|
||||
case OP_CRMINPLUS:
|
||||
case OP_CRQUERY:
|
||||
case OP_CRMINQUERY:
|
||||
case OP_CRPOSSTAR:
|
||||
case OP_CRPOSPLUS:
|
||||
case OP_CRPOSQUERY:
|
||||
fprintf(f, "%s", OP_names[*ccode]);
|
||||
extra += OP_lengths[*ccode];
|
||||
break;
|
||||
|
||||
case OP_CRRANGE:
|
||||
case OP_CRMINRANGE:
|
||||
case OP_CRPOSRANGE:
|
||||
min = GET2(ccode,1);
|
||||
max = GET2(ccode,1 + IMM2_SIZE);
|
||||
if (max == 0) fprintf(f, "{%u,}", min);
|
||||
else fprintf(f, "{%u,%u}", min, max);
|
||||
if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
|
||||
else if (*ccode == OP_CRPOSRANGE) fprintf(f, "+");
|
||||
extra += OP_lengths[*ccode];
|
||||
break;
|
||||
|
||||
/* Do nothing if it's not a repeat; this code stops picky compilers
|
||||
warning about the lack of a default code path. */
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case OP_MARK:
|
||||
case OP_COMMIT_ARG:
|
||||
case OP_PRUNE_ARG:
|
||||
case OP_SKIP_ARG:
|
||||
case OP_THEN_ARG:
|
||||
fprintf(f, " %s ", OP_names[*code]);
|
||||
print_custring_bylen(f, code + 2, code[1]);
|
||||
extra += code[1];
|
||||
break;
|
||||
|
||||
case OP_THEN:
|
||||
fprintf(f, " %s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_CIRCM:
|
||||
case OP_DOLLM:
|
||||
flag = "/m";
|
||||
/* Fall through */
|
||||
|
||||
/* Anything else is just an item with no data, but possibly a flag. */
|
||||
|
||||
default:
|
||||
fprintf(f, " %s %s", flag, OP_names[*code]);
|
||||
break;
|
||||
}
|
||||
|
||||
code += OP_lengths[*code] + extra;
|
||||
fprintf(f, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* End of pcre2_printint.c */
|
344
third_party/pcre/pcre2_script_run.c
vendored
Normal file
344
third_party/pcre/pcre2_script_run.c
vendored
Normal file
|
@ -0,0 +1,344 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains the function for checking a script run. */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Check script run *
|
||||
*************************************************/
|
||||
|
||||
/* A script run is conceptually a sequence of characters all in the same
|
||||
Unicode script. However, it isn't quite that simple. There are special rules
|
||||
for scripts that are commonly used together, and also special rules for digits.
|
||||
This function implements the appropriate checks, which is possible only when
|
||||
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
|
||||
no Unicode support; however, it should never be called in that circumstance
|
||||
because an error is given by pcre2_compile() if a script run is called for in a
|
||||
version of PCRE2 compiled without Unicode support.
|
||||
|
||||
Arguments:
|
||||
pgr point to the first character
|
||||
endptr point after the last character
|
||||
utf TRUE if in UTF mode
|
||||
|
||||
Returns: TRUE if this is a valid script run
|
||||
*/
|
||||
|
||||
/* These are states in the checking process. */
|
||||
|
||||
enum { SCRIPT_UNSET, /* Requirement as yet unknown */
|
||||
SCRIPT_MAP, /* Bitmap contains acceptable scripts */
|
||||
SCRIPT_HANPENDING, /* Have had only Han characters */
|
||||
SCRIPT_HANHIRAKATA, /* Expect Han or Hirikata */
|
||||
SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */
|
||||
SCRIPT_HANHANGUL /* Expect Han or Hangul */
|
||||
};
|
||||
|
||||
#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
|
||||
#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)
|
||||
|
||||
BOOL
|
||||
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
|
||||
{
|
||||
#ifdef SUPPORT_UNICODE
|
||||
uint32_t require_state = SCRIPT_UNSET;
|
||||
uint32_t require_map[FULL_MAPSIZE];
|
||||
uint32_t map[FULL_MAPSIZE];
|
||||
uint32_t require_digitset = 0;
|
||||
uint32_t c;
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
(void)utf; /* Avoid compiler warning */
|
||||
#endif
|
||||
|
||||
/* Any string containing fewer than 2 characters is a valid script run. */
|
||||
|
||||
if (ptr >= endptr) return TRUE;
|
||||
GETCHARINCTEST(c, ptr);
|
||||
if (ptr >= endptr) return TRUE;
|
||||
|
||||
/* Initialize the require map. This is a full-size bitmap that has a bit for
|
||||
every script, as opposed to the maps in ucd_script_sets, which only have bits
|
||||
for scripts less than ucp_Unknown - those that appear in script extension
|
||||
lists. */
|
||||
|
||||
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
|
||||
|
||||
/* Scan strings of two or more characters, checking the Unicode characteristics
|
||||
of each code point. There is special code for scripts that can be combined with
|
||||
characters from the Han Chinese script. This may be used in conjunction with
|
||||
four other scripts in these combinations:
|
||||
|
||||
. Han with Hiragana and Katakana is allowed (for Japanese).
|
||||
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
|
||||
. Han with Hangul is allowed (for Korean).
|
||||
|
||||
If the first significant character's script is one of the four, the required
|
||||
script type is immediately known. However, if the first significant
|
||||
character's script is Han, we have to keep checking for a non-Han character.
|
||||
Hence the SCRIPT_HANPENDING state. */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
const ucd_record *ucd = GET_UCD(c);
|
||||
uint32_t script = ucd->script;
|
||||
|
||||
/* If the script is Unknown, the string is not a valid script run. Such
|
||||
characters can only form script runs of length one (see test above). */
|
||||
|
||||
if (script == ucp_Unknown) return FALSE;
|
||||
|
||||
/* A character without any script extensions whose script is Inherited or
|
||||
Common is always accepted with any script. If there are extensions, the
|
||||
following processing happens for all scripts. */
|
||||
|
||||
if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
|
||||
{
|
||||
BOOL OK;
|
||||
|
||||
/* Set up a full-sized map for this character that can include bits for all
|
||||
scripts. Copy the scriptx map for this character (which covers those
|
||||
scripts that appear in script extension lists), set the remaining values to
|
||||
zero, and then, except for Common or Inherited, add this script's bit to
|
||||
the map. */
|
||||
|
||||
memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
|
||||
memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
|
||||
if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
|
||||
|
||||
/* Handle the different checking states */
|
||||
|
||||
switch(require_state)
|
||||
{
|
||||
/* First significant character - it might follow Common or Inherited
|
||||
characters that do not have any script extensions. */
|
||||
|
||||
case SCRIPT_UNSET:
|
||||
switch(script)
|
||||
{
|
||||
case ucp_Han:
|
||||
require_state = SCRIPT_HANPENDING;
|
||||
break;
|
||||
|
||||
case ucp_Hiragana:
|
||||
case ucp_Katakana:
|
||||
require_state = SCRIPT_HANHIRAKATA;
|
||||
break;
|
||||
|
||||
case ucp_Bopomofo:
|
||||
require_state = SCRIPT_HANBOPOMOFO;
|
||||
break;
|
||||
|
||||
case ucp_Hangul:
|
||||
require_state = SCRIPT_HANHANGUL;
|
||||
break;
|
||||
|
||||
default:
|
||||
memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
|
||||
require_state = SCRIPT_MAP;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
/* The first significant character was Han. An inspection of the Unicode
|
||||
11.0.0 files shows that there are the following types of Script Extension
|
||||
list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
|
||||
scripts:
|
||||
|
||||
. Bopomofo + Han
|
||||
. Han + Hiragana + Katakana
|
||||
. Hiragana + Katakana
|
||||
. Bopopmofo + Hangul + Han + Hiragana + Katakana
|
||||
|
||||
The following code tries to make sense of this. */
|
||||
|
||||
#define FOUND_BOPOMOFO 1
|
||||
#define FOUND_HIRAGANA 2
|
||||
#define FOUND_KATAKANA 4
|
||||
#define FOUND_HANGUL 8
|
||||
|
||||
case SCRIPT_HANPENDING:
|
||||
if (script != ucp_Han) /* Another Han does nothing */
|
||||
{
|
||||
uint32_t chspecial = 0;
|
||||
|
||||
if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
|
||||
if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
|
||||
if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
|
||||
if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
|
||||
|
||||
if (chspecial == 0) return FALSE; /* Not allowed with Han */
|
||||
|
||||
if (chspecial == FOUND_BOPOMOFO)
|
||||
require_state = SCRIPT_HANBOPOMOFO;
|
||||
else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
|
||||
require_state = SCRIPT_HANHIRAKATA;
|
||||
|
||||
/* Otherwise this character must be allowed with all of them, so remain
|
||||
in the pending state. */
|
||||
}
|
||||
break;
|
||||
|
||||
/* Previously encountered one of the "with Han" scripts. Check that
|
||||
this character is appropriate. */
|
||||
|
||||
case SCRIPT_HANHIRAKATA:
|
||||
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
|
||||
MAPBIT(map, ucp_Katakana) == 0) return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANBOPOMOFO:
|
||||
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANHANGUL:
|
||||
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
|
||||
break;
|
||||
|
||||
/* Previously encountered one or more characters that are allowed with a
|
||||
list of scripts. */
|
||||
|
||||
case SCRIPT_MAP:
|
||||
OK = FALSE;
|
||||
|
||||
for (int i = 0; i < FULL_MAPSIZE; i++)
|
||||
{
|
||||
if ((require_map[i] & map[i]) != 0)
|
||||
{
|
||||
OK = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!OK) return FALSE;
|
||||
|
||||
/* The rest of the string must be in this script, but we have to
|
||||
allow for the Han complications. */
|
||||
|
||||
switch(script)
|
||||
{
|
||||
case ucp_Han:
|
||||
require_state = SCRIPT_HANPENDING;
|
||||
break;
|
||||
|
||||
case ucp_Hiragana:
|
||||
case ucp_Katakana:
|
||||
require_state = SCRIPT_HANHIRAKATA;
|
||||
break;
|
||||
|
||||
case ucp_Bopomofo:
|
||||
require_state = SCRIPT_HANBOPOMOFO;
|
||||
break;
|
||||
|
||||
case ucp_Hangul:
|
||||
require_state = SCRIPT_HANHANGUL;
|
||||
break;
|
||||
|
||||
/* Compute the intersection of the required list of scripts and the
|
||||
allowed scripts for this character. */
|
||||
|
||||
default:
|
||||
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
} /* End checking character's script and extensions. */
|
||||
|
||||
/* The character is in an acceptable script. We must now ensure that all
|
||||
decimal digits in the string come from the same set. Some scripts (e.g.
|
||||
Common, Arabic) have more than one set of decimal digits. This code does
|
||||
not allow mixing sets, even within the same script. The vector called
|
||||
PRIV(ucd_digit_sets)[] contains, in its first element, the number of
|
||||
following elements, and then, in ascending order, the code points of the
|
||||
'9' characters in every set of 10 digits. Each set is identified by the
|
||||
offset in the vector of its '9' character. An initial check of the first
|
||||
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
|
||||
|
||||
if (ucd->chartype == ucp_Nd)
|
||||
{
|
||||
uint32_t digitset;
|
||||
|
||||
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
|
||||
{
|
||||
int mid;
|
||||
int bot = 1;
|
||||
int top = PRIV(ucd_digit_sets)[0];
|
||||
for (;;)
|
||||
{
|
||||
if (top <= bot + 1) /* <= rather than == is paranoia */
|
||||
{
|
||||
digitset = top;
|
||||
break;
|
||||
}
|
||||
mid = (top + bot) / 2;
|
||||
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
|
||||
}
|
||||
}
|
||||
|
||||
/* A required value of 0 means "unset". */
|
||||
|
||||
if (require_digitset == 0) require_digitset = digitset;
|
||||
else if (digitset != require_digitset) return FALSE;
|
||||
} /* End digit handling */
|
||||
|
||||
/* If we haven't yet got to the end, pick up the next character. */
|
||||
|
||||
if (ptr >= endptr) return TRUE;
|
||||
GETCHARINCTEST(c, ptr);
|
||||
} /* End checking loop */
|
||||
|
||||
#else /* NOT SUPPORT_UNICODE */
|
||||
(void)ptr;
|
||||
(void)endptr;
|
||||
(void)utf;
|
||||
return TRUE;
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
}
|
||||
|
||||
/* End of pcre2_script_run.c */
|
286
third_party/pcre/pcre2_serialize.c
vendored
Normal file
286
third_party/pcre/pcre2_serialize.c
vendored
Normal file
|
@ -0,0 +1,286 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2020 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains functions for serializing and deserializing
|
||||
a sequence of compiled codes. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
/* Magic number to provide a small check against being handed junk. */
|
||||
|
||||
#define SERIALIZED_DATA_MAGIC 0x50523253u
|
||||
|
||||
/* Deserialization is limited to the current PCRE version and
|
||||
character width. */
|
||||
|
||||
#define SERIALIZED_DATA_VERSION \
|
||||
((PCRE2_MAJOR) | ((PCRE2_MINOR) << 16))
|
||||
|
||||
#define SERIALIZED_DATA_CONFIG \
|
||||
(sizeof(PCRE2_UCHAR) | ((sizeof(void*)) << 8) | ((sizeof(PCRE2_SIZE)) << 16))
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Serialize compiled patterns *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION
|
||||
pcre2_serialize_encode(const pcre2_code **codes, int32_t number_of_codes,
|
||||
uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size,
|
||||
pcre2_general_context *gcontext)
|
||||
{
|
||||
uint8_t *bytes;
|
||||
uint8_t *dst_bytes;
|
||||
int32_t i;
|
||||
PCRE2_SIZE total_size;
|
||||
const pcre2_real_code *re;
|
||||
const uint8_t *tables;
|
||||
pcre2_serialized_data *data;
|
||||
|
||||
const pcre2_memctl *memctl = (gcontext != NULL) ?
|
||||
&gcontext->memctl : &PRIV(default_compile_context).memctl;
|
||||
|
||||
if (codes == NULL || serialized_bytes == NULL || serialized_size == NULL)
|
||||
return PCRE2_ERROR_NULL;
|
||||
|
||||
if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA;
|
||||
|
||||
/* Compute total size. */
|
||||
total_size = sizeof(pcre2_serialized_data) + TABLES_LENGTH;
|
||||
tables = NULL;
|
||||
|
||||
for (i = 0; i < number_of_codes; i++)
|
||||
{
|
||||
if (codes[i] == NULL) return PCRE2_ERROR_NULL;
|
||||
re = (const pcre2_real_code *)(codes[i]);
|
||||
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
|
||||
if (tables == NULL)
|
||||
tables = re->tables;
|
||||
else if (tables != re->tables)
|
||||
return PCRE2_ERROR_MIXEDTABLES;
|
||||
total_size += re->blocksize;
|
||||
}
|
||||
|
||||
/* Initialize the byte stream. */
|
||||
bytes = memctl->malloc(total_size + sizeof(pcre2_memctl), memctl->memory_data);
|
||||
if (bytes == NULL) return PCRE2_ERROR_NOMEMORY;
|
||||
|
||||
/* The controller is stored as a hidden parameter. */
|
||||
memcpy(bytes, memctl, sizeof(pcre2_memctl));
|
||||
bytes += sizeof(pcre2_memctl);
|
||||
|
||||
data = (pcre2_serialized_data *)bytes;
|
||||
data->magic = SERIALIZED_DATA_MAGIC;
|
||||
data->version = SERIALIZED_DATA_VERSION;
|
||||
data->config = SERIALIZED_DATA_CONFIG;
|
||||
data->number_of_codes = number_of_codes;
|
||||
|
||||
/* Copy all compiled code data. */
|
||||
dst_bytes = bytes + sizeof(pcre2_serialized_data);
|
||||
memcpy(dst_bytes, tables, TABLES_LENGTH);
|
||||
dst_bytes += TABLES_LENGTH;
|
||||
|
||||
for (i = 0; i < number_of_codes; i++)
|
||||
{
|
||||
re = (const pcre2_real_code *)(codes[i]);
|
||||
(void)memcpy(dst_bytes, (char *)re, re->blocksize);
|
||||
|
||||
/* Certain fields in the compiled code block are re-set during
|
||||
deserialization. In order to ensure that the serialized data stream is always
|
||||
the same for the same pattern, set them to zero here. We can't assume the
|
||||
copy of the pattern is correctly aligned for accessing the fields as part of
|
||||
a structure. Note the use of sizeof(void *) in the second of these, to
|
||||
specify the size of a pointer. If sizeof(uint8_t *) is used (tables is a
|
||||
pointer to uint8_t), gcc gives a warning because the first argument is also a
|
||||
pointer to uint8_t. Casting the first argument to (void *) can stop this, but
|
||||
it didn't stop Coverity giving the same complaint. */
|
||||
|
||||
(void)memset(dst_bytes + offsetof(pcre2_real_code, memctl), 0,
|
||||
sizeof(pcre2_memctl));
|
||||
(void)memset(dst_bytes + offsetof(pcre2_real_code, tables), 0,
|
||||
sizeof(void *));
|
||||
(void)memset(dst_bytes + offsetof(pcre2_real_code, executable_jit), 0,
|
||||
sizeof(void *));
|
||||
|
||||
dst_bytes += re->blocksize;
|
||||
}
|
||||
|
||||
*serialized_bytes = bytes;
|
||||
*serialized_size = total_size;
|
||||
return number_of_codes;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Deserialize compiled patterns *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION
|
||||
pcre2_serialize_decode(pcre2_code **codes, int32_t number_of_codes,
|
||||
const uint8_t *bytes, pcre2_general_context *gcontext)
|
||||
{
|
||||
const pcre2_serialized_data *data = (const pcre2_serialized_data *)bytes;
|
||||
const pcre2_memctl *memctl = (gcontext != NULL) ?
|
||||
&gcontext->memctl : &PRIV(default_compile_context).memctl;
|
||||
|
||||
const uint8_t *src_bytes;
|
||||
pcre2_real_code *dst_re;
|
||||
uint8_t *tables;
|
||||
int32_t i, j;
|
||||
|
||||
/* Sanity checks. */
|
||||
|
||||
if (data == NULL || codes == NULL) return PCRE2_ERROR_NULL;
|
||||
if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA;
|
||||
if (data->number_of_codes <= 0) return PCRE2_ERROR_BADSERIALIZEDDATA;
|
||||
if (data->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC;
|
||||
if (data->version != SERIALIZED_DATA_VERSION) return PCRE2_ERROR_BADMODE;
|
||||
if (data->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE;
|
||||
|
||||
if (number_of_codes > data->number_of_codes)
|
||||
number_of_codes = data->number_of_codes;
|
||||
|
||||
src_bytes = bytes + sizeof(pcre2_serialized_data);
|
||||
|
||||
/* Decode tables. The reference count for the tables is stored immediately
|
||||
following them. */
|
||||
|
||||
tables = memctl->malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), memctl->memory_data);
|
||||
if (tables == NULL) return PCRE2_ERROR_NOMEMORY;
|
||||
|
||||
memcpy(tables, src_bytes, TABLES_LENGTH);
|
||||
*(PCRE2_SIZE *)(tables + TABLES_LENGTH) = number_of_codes;
|
||||
src_bytes += TABLES_LENGTH;
|
||||
|
||||
/* Decode the byte stream. We must not try to read the size from the compiled
|
||||
code block in the stream, because it might be unaligned, which causes errors on
|
||||
hardware such as Sparc-64 that doesn't like unaligned memory accesses. The type
|
||||
of the blocksize field is given its own name to ensure that it is the same here
|
||||
as in the block. */
|
||||
|
||||
for (i = 0; i < number_of_codes; i++)
|
||||
{
|
||||
CODE_BLOCKSIZE_TYPE blocksize;
|
||||
memcpy(&blocksize, src_bytes + offsetof(pcre2_real_code, blocksize),
|
||||
sizeof(CODE_BLOCKSIZE_TYPE));
|
||||
if (blocksize <= sizeof(pcre2_real_code))
|
||||
return PCRE2_ERROR_BADSERIALIZEDDATA;
|
||||
|
||||
/* The allocator provided by gcontext replaces the original one. */
|
||||
|
||||
dst_re = (pcre2_real_code *)PRIV(memctl_malloc)(blocksize,
|
||||
(pcre2_memctl *)gcontext);
|
||||
if (dst_re == NULL)
|
||||
{
|
||||
memctl->free(tables, memctl->memory_data);
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
memctl->free(codes[j], memctl->memory_data);
|
||||
codes[j] = NULL;
|
||||
}
|
||||
return PCRE2_ERROR_NOMEMORY;
|
||||
}
|
||||
|
||||
/* The new allocator must be preserved. */
|
||||
|
||||
memcpy(((uint8_t *)dst_re) + sizeof(pcre2_memctl),
|
||||
src_bytes + sizeof(pcre2_memctl), blocksize - sizeof(pcre2_memctl));
|
||||
if (dst_re->magic_number != MAGIC_NUMBER ||
|
||||
dst_re->name_entry_size > MAX_NAME_SIZE + IMM2_SIZE + 1 ||
|
||||
dst_re->name_count > MAX_NAME_COUNT)
|
||||
{
|
||||
memctl->free(dst_re, memctl->memory_data);
|
||||
return PCRE2_ERROR_BADSERIALIZEDDATA;
|
||||
}
|
||||
|
||||
/* At the moment only one table is supported. */
|
||||
|
||||
dst_re->tables = tables;
|
||||
dst_re->executable_jit = NULL;
|
||||
dst_re->flags |= PCRE2_DEREF_TABLES;
|
||||
|
||||
codes[i] = dst_re;
|
||||
src_bytes += blocksize;
|
||||
}
|
||||
|
||||
return number_of_codes;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get the number of serialized patterns *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN int32_t PCRE2_CALL_CONVENTION
|
||||
pcre2_serialize_get_number_of_codes(const uint8_t *bytes)
|
||||
{
|
||||
const pcre2_serialized_data *data = (const pcre2_serialized_data *)bytes;
|
||||
|
||||
if (data == NULL) return PCRE2_ERROR_NULL;
|
||||
if (data->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC;
|
||||
if (data->version != SERIALIZED_DATA_VERSION) return PCRE2_ERROR_BADMODE;
|
||||
if (data->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE;
|
||||
|
||||
return data->number_of_codes;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Free the allocated stream *
|
||||
*************************************************/
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_serialize_free(uint8_t *bytes)
|
||||
{
|
||||
if (bytes != NULL)
|
||||
{
|
||||
pcre2_memctl *memctl = (pcre2_memctl *)(bytes - sizeof(pcre2_memctl));
|
||||
memctl->free(memctl, memctl->memory_data);
|
||||
}
|
||||
}
|
||||
|
||||
/* End of pcre2_serialize.c */
|
237
third_party/pcre/pcre2_string_utils.c
vendored
Normal file
237
third_party/pcre/pcre2_string_utils.c
vendored
Normal file
|
@ -0,0 +1,237 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2018-2021 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains internal functions for comparing and finding the length
|
||||
of strings. These are used instead of strcmp() etc because the standard
|
||||
functions work only on 8-bit data. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Emulated memmove() for systems without it *
|
||||
*************************************************/
|
||||
|
||||
/* This function can make use of bcopy() if it is available. Otherwise do it by
|
||||
steam, as there some non-Unix environments that lack both memmove() and
|
||||
bcopy(). */
|
||||
|
||||
#if !defined(VPCOMPAT) && !defined(HAVE_MEMMOVE)
|
||||
void *
|
||||
PRIV(memmove)(void *d, const void *s, size_t n)
|
||||
{
|
||||
#ifdef HAVE_BCOPY
|
||||
bcopy(s, d, n);
|
||||
return d;
|
||||
#else
|
||||
size_t i;
|
||||
unsigned char *dest = (unsigned char *)d;
|
||||
const unsigned char *src = (const unsigned char *)s;
|
||||
if (dest > src)
|
||||
{
|
||||
dest += n;
|
||||
src += n;
|
||||
for (i = 0; i < n; ++i) *(--dest) = *(--src);
|
||||
return (void *)dest;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = 0; i < n; ++i) *dest++ = *src++;
|
||||
return (void *)(dest - n);
|
||||
}
|
||||
#endif /* not HAVE_BCOPY */
|
||||
}
|
||||
#endif /* not VPCOMPAT && not HAVE_MEMMOVE */
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Compare two zero-terminated PCRE2 strings *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
str1 first string
|
||||
str2 second string
|
||||
|
||||
Returns: 0, 1, or -1
|
||||
*/
|
||||
|
||||
int
|
||||
PRIV(strcmp)(PCRE2_SPTR str1, PCRE2_SPTR str2)
|
||||
{
|
||||
PCRE2_UCHAR c1, c2;
|
||||
while (*str1 != '\0' || *str2 != '\0')
|
||||
{
|
||||
c1 = *str1++;
|
||||
c2 = *str2++;
|
||||
if (c1 != c2) return ((c1 > c2) << 1) - 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Compare zero-terminated PCRE2 & 8-bit strings *
|
||||
*************************************************/
|
||||
|
||||
/* As the 8-bit string is almost always a literal, its type is specified as
|
||||
const char *.
|
||||
|
||||
Arguments:
|
||||
str1 first string
|
||||
str2 second string
|
||||
|
||||
Returns: 0, 1, or -1
|
||||
*/
|
||||
|
||||
int
|
||||
PRIV(strcmp_c8)(PCRE2_SPTR str1, const char *str2)
|
||||
{
|
||||
PCRE2_UCHAR c1, c2;
|
||||
while (*str1 != '\0' || *str2 != '\0')
|
||||
{
|
||||
c1 = *str1++;
|
||||
c2 = *str2++;
|
||||
if (c1 != c2) return ((c1 > c2) << 1) - 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Compare two PCRE2 strings, given a length *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
str1 first string
|
||||
str2 second string
|
||||
len the length
|
||||
|
||||
Returns: 0, 1, or -1
|
||||
*/
|
||||
|
||||
int
|
||||
PRIV(strncmp)(PCRE2_SPTR str1, PCRE2_SPTR str2, size_t len)
|
||||
{
|
||||
PCRE2_UCHAR c1, c2;
|
||||
for (; len > 0; len--)
|
||||
{
|
||||
c1 = *str1++;
|
||||
c2 = *str2++;
|
||||
if (c1 != c2) return ((c1 > c2) << 1) - 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Compare PCRE2 string to 8-bit string by length *
|
||||
*************************************************/
|
||||
|
||||
/* As the 8-bit string is almost always a literal, its type is specified as
|
||||
const char *.
|
||||
|
||||
Arguments:
|
||||
str1 first string
|
||||
str2 second string
|
||||
len the length
|
||||
|
||||
Returns: 0, 1, or -1
|
||||
*/
|
||||
|
||||
int
|
||||
PRIV(strncmp_c8)(PCRE2_SPTR str1, const char *str2, size_t len)
|
||||
{
|
||||
PCRE2_UCHAR c1, c2;
|
||||
for (; len > 0; len--)
|
||||
{
|
||||
c1 = *str1++;
|
||||
c2 = *str2++;
|
||||
if (c1 != c2) return ((c1 > c2) << 1) - 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find the length of a PCRE2 string *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Argument: the string
|
||||
Returns: the length
|
||||
*/
|
||||
|
||||
PCRE2_SIZE
|
||||
PRIV(strlen)(PCRE2_SPTR str)
|
||||
{
|
||||
PCRE2_SIZE c = 0;
|
||||
while (*str++ != 0) c++;
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Copy 8-bit 0-terminated string to PCRE2 string *
|
||||
*************************************************/
|
||||
|
||||
/* Arguments:
|
||||
str1 buffer to receive the string
|
||||
str2 8-bit string to be copied
|
||||
|
||||
Returns: the number of code units used (excluding trailing zero)
|
||||
*/
|
||||
|
||||
PCRE2_SIZE
|
||||
PRIV(strcpy_c8)(PCRE2_UCHAR *str1, const char *str2)
|
||||
{
|
||||
PCRE2_UCHAR *t = str1;
|
||||
while (*str2 != 0) *t++ = *str2++;
|
||||
*t = 0;
|
||||
return t - str1;
|
||||
}
|
||||
|
||||
/* End of pcre2_string_utils.c */
|
1825
third_party/pcre/pcre2_study.c
vendored
Normal file
1825
third_party/pcre/pcre2_study.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
1009
third_party/pcre/pcre2_substitute.c
vendored
Normal file
1009
third_party/pcre/pcre2_substitute.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
547
third_party/pcre/pcre2_substring.c
vendored
Normal file
547
third_party/pcre/pcre2_substring.c
vendored
Normal file
|
@ -0,0 +1,547 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Copy named captured string to given buffer *
|
||||
*************************************************/
|
||||
|
||||
/* This function copies a single captured substring into a given buffer,
|
||||
identifying it by name. If the regex permits duplicate names, the first
|
||||
substring that is set is chosen.
|
||||
|
||||
Arguments:
|
||||
match_data points to the match data
|
||||
stringname the name of the required substring
|
||||
buffer where to put the substring
|
||||
sizeptr the size of the buffer, updated to the size of the substring
|
||||
|
||||
Returns: if successful: zero
|
||||
if not successful, a negative error code:
|
||||
(1) an error from nametable_scan()
|
||||
(2) an error from copy_bynumber()
|
||||
(3) PCRE2_ERROR_UNAVAILABLE: no group is in ovector
|
||||
(4) PCRE2_ERROR_UNSET: all named groups in ovector are unset
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR stringname,
|
||||
PCRE2_UCHAR *buffer, PCRE2_SIZE *sizeptr)
|
||||
{
|
||||
PCRE2_SPTR first, last, entry;
|
||||
int failrc, entrysize;
|
||||
if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER)
|
||||
return PCRE2_ERROR_DFA_UFUNC;
|
||||
entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
|
||||
&first, &last);
|
||||
if (entrysize < 0) return entrysize;
|
||||
failrc = PCRE2_ERROR_UNAVAILABLE;
|
||||
for (entry = first; entry <= last; entry += entrysize)
|
||||
{
|
||||
uint32_t n = GET2(entry, 0);
|
||||
if (n < match_data->oveccount)
|
||||
{
|
||||
if (match_data->ovector[n*2] != PCRE2_UNSET)
|
||||
return pcre2_substring_copy_bynumber(match_data, n, buffer, sizeptr);
|
||||
failrc = PCRE2_ERROR_UNSET;
|
||||
}
|
||||
}
|
||||
return failrc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Copy numbered captured string to given buffer *
|
||||
*************************************************/
|
||||
|
||||
/* This function copies a single captured substring into a given buffer,
|
||||
identifying it by number.
|
||||
|
||||
Arguments:
|
||||
match_data points to the match data
|
||||
stringnumber the number of the required substring
|
||||
buffer where to put the substring
|
||||
sizeptr the size of the buffer, updated to the size of the substring
|
||||
|
||||
Returns: if successful: 0
|
||||
if not successful, a negative error code:
|
||||
PCRE2_ERROR_NOMEMORY: buffer too small
|
||||
PCRE2_ERROR_NOSUBSTRING: no such substring
|
||||
PCRE2_ERROR_UNAVAILABLE: ovector too small
|
||||
PCRE2_ERROR_UNSET: substring is not set
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
|
||||
uint32_t stringnumber, PCRE2_UCHAR *buffer, PCRE2_SIZE *sizeptr)
|
||||
{
|
||||
int rc;
|
||||
PCRE2_SIZE size;
|
||||
rc = pcre2_substring_length_bynumber(match_data, stringnumber, &size);
|
||||
if (rc < 0) return rc;
|
||||
if (size + 1 > *sizeptr) return PCRE2_ERROR_NOMEMORY;
|
||||
memcpy(buffer, match_data->subject + match_data->ovector[stringnumber*2],
|
||||
CU2BYTES(size));
|
||||
buffer[size] = 0;
|
||||
*sizeptr = size;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Extract named captured string *
|
||||
*************************************************/
|
||||
|
||||
/* This function copies a single captured substring, identified by name, into
|
||||
new memory. If the regex permits duplicate names, the first substring that is
|
||||
set is chosen.
|
||||
|
||||
Arguments:
|
||||
match_data pointer to match_data
|
||||
stringname the name of the required substring
|
||||
stringptr where to put the pointer to the new memory
|
||||
sizeptr where to put the length of the substring
|
||||
|
||||
Returns: if successful: zero
|
||||
if not successful, a negative value:
|
||||
(1) an error from nametable_scan()
|
||||
(2) an error from get_bynumber()
|
||||
(3) PCRE2_ERROR_UNAVAILABLE: no group is in ovector
|
||||
(4) PCRE2_ERROR_UNSET: all named groups in ovector are unset
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_get_byname(pcre2_match_data *match_data,
|
||||
PCRE2_SPTR stringname, PCRE2_UCHAR **stringptr, PCRE2_SIZE *sizeptr)
|
||||
{
|
||||
PCRE2_SPTR first, last, entry;
|
||||
int failrc, entrysize;
|
||||
if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER)
|
||||
return PCRE2_ERROR_DFA_UFUNC;
|
||||
entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
|
||||
&first, &last);
|
||||
if (entrysize < 0) return entrysize;
|
||||
failrc = PCRE2_ERROR_UNAVAILABLE;
|
||||
for (entry = first; entry <= last; entry += entrysize)
|
||||
{
|
||||
uint32_t n = GET2(entry, 0);
|
||||
if (n < match_data->oveccount)
|
||||
{
|
||||
if (match_data->ovector[n*2] != PCRE2_UNSET)
|
||||
return pcre2_substring_get_bynumber(match_data, n, stringptr, sizeptr);
|
||||
failrc = PCRE2_ERROR_UNSET;
|
||||
}
|
||||
}
|
||||
return failrc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Extract captured string to new memory *
|
||||
*************************************************/
|
||||
|
||||
/* This function copies a single captured substring into a piece of new
|
||||
memory.
|
||||
|
||||
Arguments:
|
||||
match_data points to match data
|
||||
stringnumber the number of the required substring
|
||||
stringptr where to put a pointer to the new memory
|
||||
sizeptr where to put the size of the substring
|
||||
|
||||
Returns: if successful: 0
|
||||
if not successful, a negative error code:
|
||||
PCRE2_ERROR_NOMEMORY: failed to get memory
|
||||
PCRE2_ERROR_NOSUBSTRING: no such substring
|
||||
PCRE2_ERROR_UNAVAILABLE: ovector too small
|
||||
PCRE2_ERROR_UNSET: substring is not set
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_get_bynumber(pcre2_match_data *match_data,
|
||||
uint32_t stringnumber, PCRE2_UCHAR **stringptr, PCRE2_SIZE *sizeptr)
|
||||
{
|
||||
int rc;
|
||||
PCRE2_SIZE size;
|
||||
PCRE2_UCHAR *yield;
|
||||
rc = pcre2_substring_length_bynumber(match_data, stringnumber, &size);
|
||||
if (rc < 0) return rc;
|
||||
yield = PRIV(memctl_malloc)(sizeof(pcre2_memctl) +
|
||||
(size + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)match_data);
|
||||
if (yield == NULL) return PCRE2_ERROR_NOMEMORY;
|
||||
yield = (PCRE2_UCHAR *)(((char *)yield) + sizeof(pcre2_memctl));
|
||||
memcpy(yield, match_data->subject + match_data->ovector[stringnumber*2],
|
||||
CU2BYTES(size));
|
||||
yield[size] = 0;
|
||||
*stringptr = yield;
|
||||
*sizeptr = size;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Free memory obtained by get_substring *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Argument: the result of a previous pcre2_substring_get_byxxx()
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_free(PCRE2_UCHAR *string)
|
||||
{
|
||||
if (string != NULL)
|
||||
{
|
||||
pcre2_memctl *memctl = (pcre2_memctl *)((char *)string - sizeof(pcre2_memctl));
|
||||
memctl->free(memctl, memctl->memory_data);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get length of a named substring *
|
||||
*************************************************/
|
||||
|
||||
/* This function returns the length of a named captured substring. If the regex
|
||||
permits duplicate names, the first substring that is set is chosen.
|
||||
|
||||
Arguments:
|
||||
match_data pointer to match data
|
||||
stringname the name of the required substring
|
||||
sizeptr where to put the length
|
||||
|
||||
Returns: 0 if successful, else a negative error number
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_length_byname(pcre2_match_data *match_data,
|
||||
PCRE2_SPTR stringname, PCRE2_SIZE *sizeptr)
|
||||
{
|
||||
PCRE2_SPTR first, last, entry;
|
||||
int failrc, entrysize;
|
||||
if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER)
|
||||
return PCRE2_ERROR_DFA_UFUNC;
|
||||
entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
|
||||
&first, &last);
|
||||
if (entrysize < 0) return entrysize;
|
||||
failrc = PCRE2_ERROR_UNAVAILABLE;
|
||||
for (entry = first; entry <= last; entry += entrysize)
|
||||
{
|
||||
uint32_t n = GET2(entry, 0);
|
||||
if (n < match_data->oveccount)
|
||||
{
|
||||
if (match_data->ovector[n*2] != PCRE2_UNSET)
|
||||
return pcre2_substring_length_bynumber(match_data, n, sizeptr);
|
||||
failrc = PCRE2_ERROR_UNSET;
|
||||
}
|
||||
}
|
||||
return failrc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get length of a numbered substring *
|
||||
*************************************************/
|
||||
|
||||
/* This function returns the length of a captured substring. If the start is
|
||||
beyond the end (which can happen when \K is used in an assertion), it sets the
|
||||
length to zero.
|
||||
|
||||
Arguments:
|
||||
match_data pointer to match data
|
||||
stringnumber the number of the required substring
|
||||
sizeptr where to put the length, if not NULL
|
||||
|
||||
Returns: if successful: 0
|
||||
if not successful, a negative error code:
|
||||
PCRE2_ERROR_NOSUBSTRING: no such substring
|
||||
PCRE2_ERROR_UNAVAILABLE: ovector is too small
|
||||
PCRE2_ERROR_UNSET: substring is not set
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_length_bynumber(pcre2_match_data *match_data,
|
||||
uint32_t stringnumber, PCRE2_SIZE *sizeptr)
|
||||
{
|
||||
PCRE2_SIZE left, right;
|
||||
int count = match_data->rc;
|
||||
if (count == PCRE2_ERROR_PARTIAL)
|
||||
{
|
||||
if (stringnumber > 0) return PCRE2_ERROR_PARTIAL;
|
||||
count = 0;
|
||||
}
|
||||
else if (count < 0) return count; /* Match failed */
|
||||
|
||||
if (match_data->matchedby != PCRE2_MATCHEDBY_DFA_INTERPRETER)
|
||||
{
|
||||
if (stringnumber > match_data->code->top_bracket)
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
if (stringnumber >= match_data->oveccount)
|
||||
return PCRE2_ERROR_UNAVAILABLE;
|
||||
if (match_data->ovector[stringnumber*2] == PCRE2_UNSET)
|
||||
return PCRE2_ERROR_UNSET;
|
||||
}
|
||||
else /* Matched using pcre2_dfa_match() */
|
||||
{
|
||||
if (stringnumber >= match_data->oveccount) return PCRE2_ERROR_UNAVAILABLE;
|
||||
if (count != 0 && stringnumber >= (uint32_t)count) return PCRE2_ERROR_UNSET;
|
||||
}
|
||||
|
||||
left = match_data->ovector[stringnumber*2];
|
||||
right = match_data->ovector[stringnumber*2+1];
|
||||
if (sizeptr != NULL) *sizeptr = (left > right)? 0 : right - left;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Extract all captured strings to new memory *
|
||||
*************************************************/
|
||||
|
||||
/* This function gets one chunk of memory and builds a list of pointers and all
|
||||
the captured substrings in it. A NULL pointer is put on the end of the list.
|
||||
The substrings are zero-terminated, but also, if the final argument is
|
||||
non-NULL, a list of lengths is also returned. This allows binary data to be
|
||||
handled.
|
||||
|
||||
Arguments:
|
||||
match_data points to the match data
|
||||
listptr set to point to the list of pointers
|
||||
lengthsptr set to point to the list of lengths (may be NULL)
|
||||
|
||||
Returns: if successful: 0
|
||||
if not successful, a negative error code:
|
||||
PCRE2_ERROR_NOMEMORY: failed to get memory,
|
||||
or a match failure code
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr,
|
||||
PCRE2_SIZE **lengthsptr)
|
||||
{
|
||||
int i, count, count2;
|
||||
PCRE2_SIZE size;
|
||||
PCRE2_SIZE *lensp;
|
||||
pcre2_memctl *memp;
|
||||
PCRE2_UCHAR **listp;
|
||||
PCRE2_UCHAR *sp;
|
||||
PCRE2_SIZE *ovector;
|
||||
|
||||
if ((count = match_data->rc) < 0) return count; /* Match failed */
|
||||
if (count == 0) count = match_data->oveccount; /* Ovector too small */
|
||||
|
||||
count2 = 2*count;
|
||||
ovector = match_data->ovector;
|
||||
size = sizeof(pcre2_memctl) + sizeof(PCRE2_UCHAR *); /* For final NULL */
|
||||
if (lengthsptr != NULL) size += sizeof(PCRE2_SIZE) * count; /* For lengths */
|
||||
|
||||
for (i = 0; i < count2; i += 2)
|
||||
{
|
||||
size += sizeof(PCRE2_UCHAR *) + CU2BYTES(1);
|
||||
if (ovector[i+1] > ovector[i]) size += CU2BYTES(ovector[i+1] - ovector[i]);
|
||||
}
|
||||
|
||||
memp = PRIV(memctl_malloc)(size, (pcre2_memctl *)match_data);
|
||||
if (memp == NULL) return PCRE2_ERROR_NOMEMORY;
|
||||
|
||||
*listptr = listp = (PCRE2_UCHAR **)((char *)memp + sizeof(pcre2_memctl));
|
||||
lensp = (PCRE2_SIZE *)((char *)listp + sizeof(PCRE2_UCHAR *) * (count + 1));
|
||||
|
||||
if (lengthsptr == NULL)
|
||||
{
|
||||
sp = (PCRE2_UCHAR *)lensp;
|
||||
lensp = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
*lengthsptr = lensp;
|
||||
sp = (PCRE2_UCHAR *)((char *)lensp + sizeof(PCRE2_SIZE) * count);
|
||||
}
|
||||
|
||||
for (i = 0; i < count2; i += 2)
|
||||
{
|
||||
size = (ovector[i+1] > ovector[i])? (ovector[i+1] - ovector[i]) : 0;
|
||||
|
||||
/* Size == 0 includes the case when the capture is unset. Avoid adding
|
||||
PCRE2_UNSET to match_data->subject because it overflows, even though with
|
||||
zero size calling memcpy() is harmless. */
|
||||
|
||||
if (size != 0) memcpy(sp, match_data->subject + ovector[i], CU2BYTES(size));
|
||||
*listp++ = sp;
|
||||
if (lensp != NULL) *lensp++ = size;
|
||||
sp += size;
|
||||
*sp++ = 0;
|
||||
}
|
||||
|
||||
*listp = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Free memory obtained by substring_list_get *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Argument: the result of a previous pcre2_substring_list_get()
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_list_free(PCRE2_SPTR *list)
|
||||
{
|
||||
if (list != NULL)
|
||||
{
|
||||
pcre2_memctl *memctl = (pcre2_memctl *)((char *)list - sizeof(pcre2_memctl));
|
||||
memctl->free(memctl, memctl->memory_data);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find (multiple) entries for named string *
|
||||
*************************************************/
|
||||
|
||||
/* This function scans the nametable for a given name, using binary chop. It
|
||||
returns either two pointers to the entries in the table, or, if no pointers are
|
||||
given, the number of a unique group with the given name. If duplicate names are
|
||||
permitted, and the name is not unique, an error is generated.
|
||||
|
||||
Arguments:
|
||||
code the compiled regex
|
||||
stringname the name whose entries required
|
||||
firstptr where to put the pointer to the first entry
|
||||
lastptr where to put the pointer to the last entry
|
||||
|
||||
Returns: PCRE2_ERROR_NOSUBSTRING if the name is not found
|
||||
otherwise, if firstptr and lastptr are NULL:
|
||||
a group number for a unique substring
|
||||
else PCRE2_ERROR_NOUNIQUESUBSTRING
|
||||
otherwise:
|
||||
the length of each entry, having set firstptr and lastptr
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR stringname,
|
||||
PCRE2_SPTR *firstptr, PCRE2_SPTR *lastptr)
|
||||
{
|
||||
uint16_t bot = 0;
|
||||
uint16_t top = code->name_count;
|
||||
uint16_t entrysize = code->name_entry_size;
|
||||
PCRE2_SPTR nametable = (PCRE2_SPTR)((char *)code + sizeof(pcre2_real_code));
|
||||
|
||||
while (top > bot)
|
||||
{
|
||||
uint16_t mid = (top + bot) / 2;
|
||||
PCRE2_SPTR entry = nametable + entrysize*mid;
|
||||
int c = PRIV(strcmp)(stringname, entry + IMM2_SIZE);
|
||||
if (c == 0)
|
||||
{
|
||||
PCRE2_SPTR first;
|
||||
PCRE2_SPTR last;
|
||||
PCRE2_SPTR lastentry;
|
||||
lastentry = nametable + entrysize * (code->name_count - 1);
|
||||
first = last = entry;
|
||||
while (first > nametable)
|
||||
{
|
||||
if (PRIV(strcmp)(stringname, (first - entrysize + IMM2_SIZE)) != 0) break;
|
||||
first -= entrysize;
|
||||
}
|
||||
while (last < lastentry)
|
||||
{
|
||||
if (PRIV(strcmp)(stringname, (last + entrysize + IMM2_SIZE)) != 0) break;
|
||||
last += entrysize;
|
||||
}
|
||||
if (firstptr == NULL) return (first == last)?
|
||||
(int)GET2(entry, 0) : PCRE2_ERROR_NOUNIQUESUBSTRING;
|
||||
*firstptr = first;
|
||||
*lastptr = last;
|
||||
return entrysize;
|
||||
}
|
||||
if (c > 0) bot = mid + 1; else top = mid;
|
||||
}
|
||||
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find number for named string *
|
||||
*************************************************/
|
||||
|
||||
/* This function is a convenience wrapper for pcre2_substring_nametable_scan()
|
||||
when it is known that names are unique. If there are duplicate names, it is not
|
||||
defined which number is returned.
|
||||
|
||||
Arguments:
|
||||
code the compiled regex
|
||||
stringname the name whose number is required
|
||||
|
||||
Returns: the number of the named parenthesis, or a negative number
|
||||
PCRE2_ERROR_NOSUBSTRING if not found
|
||||
PCRE2_ERROR_NOUNIQUESUBSTRING if not unique
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_number_from_name(const pcre2_code *code,
|
||||
PCRE2_SPTR stringname)
|
||||
{
|
||||
return pcre2_substring_nametable_scan(code, stringname, NULL, NULL);
|
||||
}
|
||||
|
||||
/* End of pcre2_substring.c */
|
234
third_party/pcre/pcre2_tables.c
vendored
Normal file
234
third_party/pcre/pcre2_tables.c
vendored
Normal file
|
@ -0,0 +1,234 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains some fixed tables that are used by more than one of the
|
||||
PCRE2 code modules. The tables are also #included by the pcre2test program,
|
||||
which uses macros to change their names from _pcre2_xxx to xxxx, thereby
|
||||
avoiding name clashes with the library. In this case, PCRE2_PCRE2TEST is
|
||||
defined. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST /* We're compiling the library */
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#include "pcre2_internal.h"
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
|
||||
the definition is next to the definition of the opcodes in pcre2_internal.h.
|
||||
This is mode-dependent, so it is skipped when this file is included by
|
||||
pcre2test. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS };
|
||||
#endif
|
||||
|
||||
/* Tables of horizontal and vertical whitespace characters, suitable for
|
||||
adding to classes. */
|
||||
|
||||
const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST };
|
||||
const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST };
|
||||
|
||||
/* These tables are the pairs of delimiters that are valid for callout string
|
||||
arguments. For each starting delimiter there must be a matching ending
|
||||
delimiter, which in fact is different only for bracket-like delimiters. */
|
||||
|
||||
const uint32_t PRIV(callout_start_delims)[] = {
|
||||
CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK,
|
||||
CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN,
|
||||
CHAR_DOLLAR_SIGN, CHAR_LEFT_CURLY_BRACKET, 0 };
|
||||
|
||||
const uint32_t PRIV(callout_end_delims[]) = {
|
||||
CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK,
|
||||
CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN,
|
||||
CHAR_DOLLAR_SIGN, CHAR_RIGHT_CURLY_BRACKET, 0 };
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Tables for UTF-8 support *
|
||||
*************************************************/
|
||||
|
||||
/* These tables are required by pcre2test in 16- or 32-bit mode, as well
|
||||
as for the library in 8-bit mode, because pcre2test uses UTF-8 internally for
|
||||
handling wide characters. */
|
||||
|
||||
#if defined PCRE2_PCRE2TEST || \
|
||||
(defined SUPPORT_UNICODE && \
|
||||
defined PCRE2_CODE_UNIT_WIDTH && \
|
||||
PCRE2_CODE_UNIT_WIDTH == 8)
|
||||
|
||||
/* These are the breakpoints for different numbers of bytes in a UTF-8
|
||||
character. */
|
||||
|
||||
const int PRIV(utf8_table1)[] =
|
||||
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
|
||||
|
||||
const int PRIV(utf8_table1_size) = sizeof(PRIV(utf8_table1)) / sizeof(int);
|
||||
|
||||
/* These are the indicator bits and the mask for the data bits to set in the
|
||||
first byte of a character, indexed by the number of additional bytes. */
|
||||
|
||||
const int PRIV(utf8_table2)[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||
const int PRIV(utf8_table3)[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
||||
|
||||
/* Table of the number of extra bytes, indexed by the first byte masked with
|
||||
0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
|
||||
|
||||
const uint8_t PRIV(utf8_table4)[] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
|
||||
|
||||
#endif /* UTF-8 support needed */
|
||||
|
||||
/* Tables concerned with Unicode properties are relevant only when Unicode
|
||||
support is enabled. See also the pcre2_ucptables.c file, which is generated by
|
||||
a Python script from Unicode data files. */
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
|
||||
/* Table to translate from particular type value to the general value. */
|
||||
|
||||
const uint32_t PRIV(ucp_gentype)[] = {
|
||||
ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */
|
||||
ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */
|
||||
ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */
|
||||
ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */
|
||||
ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */
|
||||
ucp_P, ucp_P, /* Ps, Po */
|
||||
ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */
|
||||
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
|
||||
};
|
||||
|
||||
/* This table encodes the rules for finding the end of an extended grapheme
|
||||
cluster. Every code point has a grapheme break property which is one of the
|
||||
ucp_gbXX values defined in pcre2_ucp.h. These changed between Unicode versions
|
||||
10 and 11. The 2-dimensional table is indexed by the properties of two adjacent
|
||||
code points. The left property selects a word from the table, and the right
|
||||
property selects a bit from that word like this:
|
||||
|
||||
PRIV(ucp_gbtable)[left-property] & (1u << right-property)
|
||||
|
||||
The value is non-zero if a grapheme break is NOT permitted between the relevant
|
||||
two code points. The breaking rules are as follows:
|
||||
|
||||
1. Break at the start and end of text (pretty obviously).
|
||||
|
||||
2. Do not break between a CR and LF; otherwise, break before and after
|
||||
controls.
|
||||
|
||||
3. Do not break Hangul syllable sequences, the rules for which are:
|
||||
|
||||
L may be followed by L, V, LV or LVT
|
||||
LV or V may be followed by V or T
|
||||
LVT or T may be followed by T
|
||||
|
||||
4. Do not break before extending characters or zero-width-joiner (ZWJ).
|
||||
|
||||
The following rules are only for extended grapheme clusters (but that's what we
|
||||
are implementing).
|
||||
|
||||
5. Do not break before SpacingMarks.
|
||||
|
||||
6. Do not break after Prepend characters.
|
||||
|
||||
7. Do not break within emoji modifier sequences or emoji zwj sequences. That
|
||||
is, do not break between characters with the Extended_Pictographic property.
|
||||
Extend and ZWJ characters are allowed between the characters; this cannot be
|
||||
represented in this table, the code has to deal with it.
|
||||
|
||||
8. Do not break within emoji flag sequences. That is, do not break between
|
||||
regional indicator (RI) symbols if there are an odd number of RI characters
|
||||
before the break point. This table encodes "join RI characters"; the code
|
||||
has to deal with checking for previous adjoining RIs.
|
||||
|
||||
9. Otherwise, break everywhere.
|
||||
*/
|
||||
|
||||
#define ESZ (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbZWJ)
|
||||
|
||||
const uint32_t PRIV(ucp_gbtable)[] = {
|
||||
(1u<<ucp_gbLF), /* 0 CR */
|
||||
0, /* 1 LF */
|
||||
0, /* 2 Control */
|
||||
ESZ, /* 3 Extend */
|
||||
ESZ|(1u<<ucp_gbPrepend)| /* 4 Prepend */
|
||||
(1u<<ucp_gbL)|(1u<<ucp_gbV)|(1u<<ucp_gbT)|
|
||||
(1u<<ucp_gbLV)|(1u<<ucp_gbLVT)|(1u<<ucp_gbOther)|
|
||||
(1u<<ucp_gbRegional_Indicator),
|
||||
ESZ, /* 5 SpacingMark */
|
||||
ESZ|(1u<<ucp_gbL)|(1u<<ucp_gbV)|(1u<<ucp_gbLV)| /* 6 L */
|
||||
(1u<<ucp_gbLVT),
|
||||
ESZ|(1u<<ucp_gbV)|(1u<<ucp_gbT), /* 7 V */
|
||||
ESZ|(1u<<ucp_gbT), /* 8 T */
|
||||
ESZ|(1u<<ucp_gbV)|(1u<<ucp_gbT), /* 9 LV */
|
||||
ESZ|(1u<<ucp_gbT), /* 10 LVT */
|
||||
(1u<<ucp_gbRegional_Indicator), /* 11 Regional Indicator */
|
||||
ESZ, /* 12 Other */
|
||||
ESZ, /* 13 ZWJ */
|
||||
ESZ|(1u<<ucp_gbExtended_Pictographic) /* 14 Extended Pictographic */
|
||||
};
|
||||
|
||||
#undef ESZ
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
/* This table reverses PRIV(ucp_gentype). We can save the cost
|
||||
of a memory load. */
|
||||
|
||||
const int PRIV(ucp_typerange)[] = {
|
||||
ucp_Cc, ucp_Cs,
|
||||
ucp_Ll, ucp_Lu,
|
||||
ucp_Mc, ucp_Mn,
|
||||
ucp_Nd, ucp_No,
|
||||
ucp_Pc, ucp_Ps,
|
||||
ucp_Sc, ucp_So,
|
||||
ucp_Zl, ucp_Zs,
|
||||
};
|
||||
#endif /* SUPPORT_JIT */
|
||||
|
||||
/* Finally, include the tables that are auto-generated from the Unicode data
|
||||
files. */
|
||||
|
||||
#include "pcre2_ucptables.inc"
|
||||
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* End of pcre2_tables.c */
|
234
third_party/pcre/pcre2_tables.inc
vendored
Normal file
234
third_party/pcre/pcre2_tables.inc
vendored
Normal file
|
@ -0,0 +1,234 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains some fixed tables that are used by more than one of the
|
||||
PCRE2 code modules. The tables are also #included by the pcre2test program,
|
||||
which uses macros to change their names from _pcre2_xxx to xxxx, thereby
|
||||
avoiding name clashes with the library. In this case, PCRE2_PCRE2TEST is
|
||||
defined. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST /* We're compiling the library */
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#include "pcre2_internal.h"
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
|
||||
the definition is next to the definition of the opcodes in pcre2_internal.h.
|
||||
This is mode-dependent, so it is skipped when this file is included by
|
||||
pcre2test. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS };
|
||||
#endif
|
||||
|
||||
/* Tables of horizontal and vertical whitespace characters, suitable for
|
||||
adding to classes. */
|
||||
|
||||
const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST };
|
||||
const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST };
|
||||
|
||||
/* These tables are the pairs of delimiters that are valid for callout string
|
||||
arguments. For each starting delimiter there must be a matching ending
|
||||
delimiter, which in fact is different only for bracket-like delimiters. */
|
||||
|
||||
const uint32_t PRIV(callout_start_delims)[] = {
|
||||
CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK,
|
||||
CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN,
|
||||
CHAR_DOLLAR_SIGN, CHAR_LEFT_CURLY_BRACKET, 0 };
|
||||
|
||||
const uint32_t PRIV(callout_end_delims[]) = {
|
||||
CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK,
|
||||
CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN,
|
||||
CHAR_DOLLAR_SIGN, CHAR_RIGHT_CURLY_BRACKET, 0 };
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Tables for UTF-8 support *
|
||||
*************************************************/
|
||||
|
||||
/* These tables are required by pcre2test in 16- or 32-bit mode, as well
|
||||
as for the library in 8-bit mode, because pcre2test uses UTF-8 internally for
|
||||
handling wide characters. */
|
||||
|
||||
#if defined PCRE2_PCRE2TEST || \
|
||||
(defined SUPPORT_UNICODE && \
|
||||
defined PCRE2_CODE_UNIT_WIDTH && \
|
||||
PCRE2_CODE_UNIT_WIDTH == 8)
|
||||
|
||||
/* These are the breakpoints for different numbers of bytes in a UTF-8
|
||||
character. */
|
||||
|
||||
const int PRIV(utf8_table1)[] =
|
||||
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
|
||||
|
||||
const int PRIV(utf8_table1_size) = sizeof(PRIV(utf8_table1)) / sizeof(int);
|
||||
|
||||
/* These are the indicator bits and the mask for the data bits to set in the
|
||||
first byte of a character, indexed by the number of additional bytes. */
|
||||
|
||||
const int PRIV(utf8_table2)[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||
const int PRIV(utf8_table3)[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
||||
|
||||
/* Table of the number of extra bytes, indexed by the first byte masked with
|
||||
0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
|
||||
|
||||
const uint8_t PRIV(utf8_table4)[] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
|
||||
|
||||
#endif /* UTF-8 support needed */
|
||||
|
||||
/* Tables concerned with Unicode properties are relevant only when Unicode
|
||||
support is enabled. See also the pcre2_ucptables.c file, which is generated by
|
||||
a Python script from Unicode data files. */
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
|
||||
/* Table to translate from particular type value to the general value. */
|
||||
|
||||
const uint32_t PRIV(ucp_gentype)[] = {
|
||||
ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */
|
||||
ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */
|
||||
ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */
|
||||
ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */
|
||||
ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */
|
||||
ucp_P, ucp_P, /* Ps, Po */
|
||||
ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */
|
||||
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
|
||||
};
|
||||
|
||||
/* This table encodes the rules for finding the end of an extended grapheme
|
||||
cluster. Every code point has a grapheme break property which is one of the
|
||||
ucp_gbXX values defined in pcre2_ucp.h. These changed between Unicode versions
|
||||
10 and 11. The 2-dimensional table is indexed by the properties of two adjacent
|
||||
code points. The left property selects a word from the table, and the right
|
||||
property selects a bit from that word like this:
|
||||
|
||||
PRIV(ucp_gbtable)[left-property] & (1u << right-property)
|
||||
|
||||
The value is non-zero if a grapheme break is NOT permitted between the relevant
|
||||
two code points. The breaking rules are as follows:
|
||||
|
||||
1. Break at the start and end of text (pretty obviously).
|
||||
|
||||
2. Do not break between a CR and LF; otherwise, break before and after
|
||||
controls.
|
||||
|
||||
3. Do not break Hangul syllable sequences, the rules for which are:
|
||||
|
||||
L may be followed by L, V, LV or LVT
|
||||
LV or V may be followed by V or T
|
||||
LVT or T may be followed by T
|
||||
|
||||
4. Do not break before extending characters or zero-width-joiner (ZWJ).
|
||||
|
||||
The following rules are only for extended grapheme clusters (but that's what we
|
||||
are implementing).
|
||||
|
||||
5. Do not break before SpacingMarks.
|
||||
|
||||
6. Do not break after Prepend characters.
|
||||
|
||||
7. Do not break within emoji modifier sequences or emoji zwj sequences. That
|
||||
is, do not break between characters with the Extended_Pictographic property.
|
||||
Extend and ZWJ characters are allowed between the characters; this cannot be
|
||||
represented in this table, the code has to deal with it.
|
||||
|
||||
8. Do not break within emoji flag sequences. That is, do not break between
|
||||
regional indicator (RI) symbols if there are an odd number of RI characters
|
||||
before the break point. This table encodes "join RI characters"; the code
|
||||
has to deal with checking for previous adjoining RIs.
|
||||
|
||||
9. Otherwise, break everywhere.
|
||||
*/
|
||||
|
||||
#define ESZ (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbZWJ)
|
||||
|
||||
const uint32_t PRIV(ucp_gbtable)[] = {
|
||||
(1u<<ucp_gbLF), /* 0 CR */
|
||||
0, /* 1 LF */
|
||||
0, /* 2 Control */
|
||||
ESZ, /* 3 Extend */
|
||||
ESZ|(1u<<ucp_gbPrepend)| /* 4 Prepend */
|
||||
(1u<<ucp_gbL)|(1u<<ucp_gbV)|(1u<<ucp_gbT)|
|
||||
(1u<<ucp_gbLV)|(1u<<ucp_gbLVT)|(1u<<ucp_gbOther)|
|
||||
(1u<<ucp_gbRegional_Indicator),
|
||||
ESZ, /* 5 SpacingMark */
|
||||
ESZ|(1u<<ucp_gbL)|(1u<<ucp_gbV)|(1u<<ucp_gbLV)| /* 6 L */
|
||||
(1u<<ucp_gbLVT),
|
||||
ESZ|(1u<<ucp_gbV)|(1u<<ucp_gbT), /* 7 V */
|
||||
ESZ|(1u<<ucp_gbT), /* 8 T */
|
||||
ESZ|(1u<<ucp_gbV)|(1u<<ucp_gbT), /* 9 LV */
|
||||
ESZ|(1u<<ucp_gbT), /* 10 LVT */
|
||||
(1u<<ucp_gbRegional_Indicator), /* 11 Regional Indicator */
|
||||
ESZ, /* 12 Other */
|
||||
ESZ, /* 13 ZWJ */
|
||||
ESZ|(1u<<ucp_gbExtended_Pictographic) /* 14 Extended Pictographic */
|
||||
};
|
||||
|
||||
#undef ESZ
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
/* This table reverses PRIV(ucp_gentype). We can save the cost
|
||||
of a memory load. */
|
||||
|
||||
const int PRIV(ucp_typerange)[] = {
|
||||
ucp_Cc, ucp_Cs,
|
||||
ucp_Ll, ucp_Lu,
|
||||
ucp_Mc, ucp_Mn,
|
||||
ucp_Nd, ucp_No,
|
||||
ucp_Pc, ucp_Ps,
|
||||
ucp_Sc, ucp_So,
|
||||
ucp_Zl, ucp_Zs,
|
||||
};
|
||||
#endif /* SUPPORT_JIT */
|
||||
|
||||
/* Finally, include the tables that are auto-generated from the Unicode data
|
||||
files. */
|
||||
|
||||
#include "pcre2_ucptables.inc"
|
||||
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* End of pcre2_tables.c */
|
5396
third_party/pcre/pcre2_ucd.c
vendored
Normal file
5396
third_party/pcre/pcre2_ucd.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
5396
third_party/pcre/pcre2_ucd.inc
vendored
Normal file
5396
third_party/pcre/pcre2_ucd.inc
vendored
Normal file
File diff suppressed because it is too large
Load diff
394
third_party/pcre/pcre2_ucp.h
vendored
Normal file
394
third_party/pcre/pcre2_ucp.h
vendored
Normal file
|
@ -0,0 +1,394 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
|
||||
Instead, modify the maint/GenerateUcpHeader.py script and run it to generate
|
||||
a new version of this code.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
|
||||
#define PCRE2_UCP_H_IDEMPOTENT_GUARD
|
||||
|
||||
/* This file contains definitions of the Unicode property values that are
|
||||
returned by the UCD access macros and used throughout PCRE2.
|
||||
|
||||
IMPORTANT: The specific values of the first two enums (general and particular
|
||||
character categories) are assumed by the table called catposstab in the file
|
||||
pcre2_auto_possess.c. They are unlikely to change, but should be checked after
|
||||
an update. */
|
||||
|
||||
/* These are the general character categories. */
|
||||
|
||||
enum {
|
||||
ucp_C,
|
||||
ucp_L,
|
||||
ucp_M,
|
||||
ucp_N,
|
||||
ucp_P,
|
||||
ucp_S,
|
||||
ucp_Z,
|
||||
};
|
||||
|
||||
/* These are the particular character categories. */
|
||||
|
||||
enum {
|
||||
ucp_Cc, /* Control */
|
||||
ucp_Cf, /* Format */
|
||||
ucp_Cn, /* Unassigned */
|
||||
ucp_Co, /* Private use */
|
||||
ucp_Cs, /* Surrogate */
|
||||
ucp_Ll, /* Lower case letter */
|
||||
ucp_Lm, /* Modifier letter */
|
||||
ucp_Lo, /* Other letter */
|
||||
ucp_Lt, /* Title case letter */
|
||||
ucp_Lu, /* Upper case letter */
|
||||
ucp_Mc, /* Spacing mark */
|
||||
ucp_Me, /* Enclosing mark */
|
||||
ucp_Mn, /* Non-spacing mark */
|
||||
ucp_Nd, /* Decimal number */
|
||||
ucp_Nl, /* Letter number */
|
||||
ucp_No, /* Other number */
|
||||
ucp_Pc, /* Connector punctuation */
|
||||
ucp_Pd, /* Dash punctuation */
|
||||
ucp_Pe, /* Close punctuation */
|
||||
ucp_Pf, /* Final punctuation */
|
||||
ucp_Pi, /* Initial punctuation */
|
||||
ucp_Po, /* Other punctuation */
|
||||
ucp_Ps, /* Open punctuation */
|
||||
ucp_Sc, /* Currency symbol */
|
||||
ucp_Sk, /* Modifier symbol */
|
||||
ucp_Sm, /* Mathematical symbol */
|
||||
ucp_So, /* Other symbol */
|
||||
ucp_Zl, /* Line separator */
|
||||
ucp_Zp, /* Paragraph separator */
|
||||
ucp_Zs, /* Space separator */
|
||||
};
|
||||
|
||||
/* These are Boolean properties. */
|
||||
|
||||
enum {
|
||||
ucp_ASCII,
|
||||
ucp_ASCII_Hex_Digit,
|
||||
ucp_Alphabetic,
|
||||
ucp_Bidi_Control,
|
||||
ucp_Bidi_Mirrored,
|
||||
ucp_Case_Ignorable,
|
||||
ucp_Cased,
|
||||
ucp_Changes_When_Casefolded,
|
||||
ucp_Changes_When_Casemapped,
|
||||
ucp_Changes_When_Lowercased,
|
||||
ucp_Changes_When_Titlecased,
|
||||
ucp_Changes_When_Uppercased,
|
||||
ucp_Dash,
|
||||
ucp_Default_Ignorable_Code_Point,
|
||||
ucp_Deprecated,
|
||||
ucp_Diacritic,
|
||||
ucp_Emoji,
|
||||
ucp_Emoji_Component,
|
||||
ucp_Emoji_Modifier,
|
||||
ucp_Emoji_Modifier_Base,
|
||||
ucp_Emoji_Presentation,
|
||||
ucp_Extended_Pictographic,
|
||||
ucp_Extender,
|
||||
ucp_Grapheme_Base,
|
||||
ucp_Grapheme_Extend,
|
||||
ucp_Grapheme_Link,
|
||||
ucp_Hex_Digit,
|
||||
ucp_IDS_Binary_Operator,
|
||||
ucp_IDS_Trinary_Operator,
|
||||
ucp_ID_Continue,
|
||||
ucp_ID_Start,
|
||||
ucp_Ideographic,
|
||||
ucp_Join_Control,
|
||||
ucp_Logical_Order_Exception,
|
||||
ucp_Lowercase,
|
||||
ucp_Math,
|
||||
ucp_Noncharacter_Code_Point,
|
||||
ucp_Pattern_Syntax,
|
||||
ucp_Pattern_White_Space,
|
||||
ucp_Prepended_Concatenation_Mark,
|
||||
ucp_Quotation_Mark,
|
||||
ucp_Radical,
|
||||
ucp_Regional_Indicator,
|
||||
ucp_Sentence_Terminal,
|
||||
ucp_Soft_Dotted,
|
||||
ucp_Terminal_Punctuation,
|
||||
ucp_Unified_Ideograph,
|
||||
ucp_Uppercase,
|
||||
ucp_Variation_Selector,
|
||||
ucp_White_Space,
|
||||
ucp_XID_Continue,
|
||||
ucp_XID_Start,
|
||||
/* This must be last */
|
||||
ucp_Bprop_Count
|
||||
};
|
||||
|
||||
/* Size of entries in ucd_boolprop_sets[] */
|
||||
|
||||
#define ucd_boolprop_sets_item_size 2
|
||||
|
||||
/* These are the bidi class values. */
|
||||
|
||||
enum {
|
||||
ucp_bidiAL, /* Arabic letter */
|
||||
ucp_bidiAN, /* Arabic number */
|
||||
ucp_bidiB, /* Paragraph separator */
|
||||
ucp_bidiBN, /* Boundary neutral */
|
||||
ucp_bidiCS, /* Common separator */
|
||||
ucp_bidiEN, /* European number */
|
||||
ucp_bidiES, /* European separator */
|
||||
ucp_bidiET, /* European terminator */
|
||||
ucp_bidiFSI, /* First strong isolate */
|
||||
ucp_bidiL, /* Left to right */
|
||||
ucp_bidiLRE, /* Left to right embedding */
|
||||
ucp_bidiLRI, /* Left to right isolate */
|
||||
ucp_bidiLRO, /* Left to right override */
|
||||
ucp_bidiNSM, /* Non-spacing mark */
|
||||
ucp_bidiON, /* Other neutral */
|
||||
ucp_bidiPDF, /* Pop directional format */
|
||||
ucp_bidiPDI, /* Pop directional isolate */
|
||||
ucp_bidiR, /* Right to left */
|
||||
ucp_bidiRLE, /* Right to left embedding */
|
||||
ucp_bidiRLI, /* Right to left isolate */
|
||||
ucp_bidiRLO, /* Right to left override */
|
||||
ucp_bidiS, /* Segment separator */
|
||||
ucp_bidiWS, /* White space */
|
||||
};
|
||||
|
||||
/* These are grapheme break properties. The Extended Pictographic property
|
||||
comes from the emoji-data.txt file. */
|
||||
|
||||
enum {
|
||||
ucp_gbCR, /* 0 */
|
||||
ucp_gbLF, /* 1 */
|
||||
ucp_gbControl, /* 2 */
|
||||
ucp_gbExtend, /* 3 */
|
||||
ucp_gbPrepend, /* 4 */
|
||||
ucp_gbSpacingMark, /* 5 */
|
||||
ucp_gbL, /* 6 Hangul syllable type L */
|
||||
ucp_gbV, /* 7 Hangul syllable type V */
|
||||
ucp_gbT, /* 8 Hangul syllable type T */
|
||||
ucp_gbLV, /* 9 Hangul syllable type LV */
|
||||
ucp_gbLVT, /* 10 Hangul syllable type LVT */
|
||||
ucp_gbRegional_Indicator, /* 11 */
|
||||
ucp_gbOther, /* 12 */
|
||||
ucp_gbZWJ, /* 13 */
|
||||
ucp_gbExtended_Pictographic, /* 14 */
|
||||
};
|
||||
|
||||
/* These are the script identifications. */
|
||||
|
||||
enum {
|
||||
/* Scripts which has characters in other scripts. */
|
||||
ucp_Latin,
|
||||
ucp_Greek,
|
||||
ucp_Cyrillic,
|
||||
ucp_Arabic,
|
||||
ucp_Syriac,
|
||||
ucp_Thaana,
|
||||
ucp_Devanagari,
|
||||
ucp_Bengali,
|
||||
ucp_Gurmukhi,
|
||||
ucp_Gujarati,
|
||||
ucp_Oriya,
|
||||
ucp_Tamil,
|
||||
ucp_Telugu,
|
||||
ucp_Kannada,
|
||||
ucp_Malayalam,
|
||||
ucp_Sinhala,
|
||||
ucp_Myanmar,
|
||||
ucp_Georgian,
|
||||
ucp_Hangul,
|
||||
ucp_Mongolian,
|
||||
ucp_Hiragana,
|
||||
ucp_Katakana,
|
||||
ucp_Bopomofo,
|
||||
ucp_Han,
|
||||
ucp_Yi,
|
||||
ucp_Tagalog,
|
||||
ucp_Hanunoo,
|
||||
ucp_Buhid,
|
||||
ucp_Tagbanwa,
|
||||
ucp_Limbu,
|
||||
ucp_Tai_Le,
|
||||
ucp_Linear_B,
|
||||
ucp_Cypriot,
|
||||
ucp_Buginese,
|
||||
ucp_Coptic,
|
||||
ucp_Glagolitic,
|
||||
ucp_Syloti_Nagri,
|
||||
ucp_Phags_Pa,
|
||||
ucp_Nko,
|
||||
ucp_Kayah_Li,
|
||||
ucp_Javanese,
|
||||
ucp_Kaithi,
|
||||
ucp_Mandaic,
|
||||
ucp_Chakma,
|
||||
ucp_Sharada,
|
||||
ucp_Takri,
|
||||
ucp_Duployan,
|
||||
ucp_Grantha,
|
||||
ucp_Khojki,
|
||||
ucp_Linear_A,
|
||||
ucp_Mahajani,
|
||||
ucp_Manichaean,
|
||||
ucp_Modi,
|
||||
ucp_Old_Permic,
|
||||
ucp_Psalter_Pahlavi,
|
||||
ucp_Khudawadi,
|
||||
ucp_Tirhuta,
|
||||
ucp_Multani,
|
||||
ucp_Adlam,
|
||||
ucp_Masaram_Gondi,
|
||||
ucp_Dogra,
|
||||
ucp_Gunjala_Gondi,
|
||||
ucp_Hanifi_Rohingya,
|
||||
ucp_Sogdian,
|
||||
ucp_Nandinagari,
|
||||
ucp_Yezidi,
|
||||
ucp_Cypro_Minoan,
|
||||
ucp_Old_Uyghur,
|
||||
|
||||
/* Scripts which has no characters in other scripts. */
|
||||
ucp_Unknown,
|
||||
ucp_Common,
|
||||
ucp_Armenian,
|
||||
ucp_Hebrew,
|
||||
ucp_Thai,
|
||||
ucp_Lao,
|
||||
ucp_Tibetan,
|
||||
ucp_Ethiopic,
|
||||
ucp_Cherokee,
|
||||
ucp_Canadian_Aboriginal,
|
||||
ucp_Ogham,
|
||||
ucp_Runic,
|
||||
ucp_Khmer,
|
||||
ucp_Old_Italic,
|
||||
ucp_Gothic,
|
||||
ucp_Deseret,
|
||||
ucp_Inherited,
|
||||
ucp_Ugaritic,
|
||||
ucp_Shavian,
|
||||
ucp_Osmanya,
|
||||
ucp_Braille,
|
||||
ucp_New_Tai_Lue,
|
||||
ucp_Tifinagh,
|
||||
ucp_Old_Persian,
|
||||
ucp_Kharoshthi,
|
||||
ucp_Balinese,
|
||||
ucp_Cuneiform,
|
||||
ucp_Phoenician,
|
||||
ucp_Sundanese,
|
||||
ucp_Lepcha,
|
||||
ucp_Ol_Chiki,
|
||||
ucp_Vai,
|
||||
ucp_Saurashtra,
|
||||
ucp_Rejang,
|
||||
ucp_Lycian,
|
||||
ucp_Carian,
|
||||
ucp_Lydian,
|
||||
ucp_Cham,
|
||||
ucp_Tai_Tham,
|
||||
ucp_Tai_Viet,
|
||||
ucp_Avestan,
|
||||
ucp_Egyptian_Hieroglyphs,
|
||||
ucp_Samaritan,
|
||||
ucp_Lisu,
|
||||
ucp_Bamum,
|
||||
ucp_Meetei_Mayek,
|
||||
ucp_Imperial_Aramaic,
|
||||
ucp_Old_South_Arabian,
|
||||
ucp_Inscriptional_Parthian,
|
||||
ucp_Inscriptional_Pahlavi,
|
||||
ucp_Old_Turkic,
|
||||
ucp_Batak,
|
||||
ucp_Brahmi,
|
||||
ucp_Meroitic_Cursive,
|
||||
ucp_Meroitic_Hieroglyphs,
|
||||
ucp_Miao,
|
||||
ucp_Sora_Sompeng,
|
||||
ucp_Caucasian_Albanian,
|
||||
ucp_Bassa_Vah,
|
||||
ucp_Elbasan,
|
||||
ucp_Pahawh_Hmong,
|
||||
ucp_Mende_Kikakui,
|
||||
ucp_Mro,
|
||||
ucp_Old_North_Arabian,
|
||||
ucp_Nabataean,
|
||||
ucp_Palmyrene,
|
||||
ucp_Pau_Cin_Hau,
|
||||
ucp_Siddham,
|
||||
ucp_Warang_Citi,
|
||||
ucp_Ahom,
|
||||
ucp_Anatolian_Hieroglyphs,
|
||||
ucp_Hatran,
|
||||
ucp_Old_Hungarian,
|
||||
ucp_SignWriting,
|
||||
ucp_Bhaiksuki,
|
||||
ucp_Marchen,
|
||||
ucp_Newa,
|
||||
ucp_Osage,
|
||||
ucp_Tangut,
|
||||
ucp_Nushu,
|
||||
ucp_Soyombo,
|
||||
ucp_Zanabazar_Square,
|
||||
ucp_Makasar,
|
||||
ucp_Medefaidrin,
|
||||
ucp_Old_Sogdian,
|
||||
ucp_Elymaic,
|
||||
ucp_Nyiakeng_Puachue_Hmong,
|
||||
ucp_Wancho,
|
||||
ucp_Chorasmian,
|
||||
ucp_Dives_Akuru,
|
||||
ucp_Khitan_Small_Script,
|
||||
ucp_Tangsa,
|
||||
ucp_Toto,
|
||||
ucp_Vithkuqi,
|
||||
|
||||
/* This must be last */
|
||||
ucp_Script_Count
|
||||
};
|
||||
|
||||
/* Size of entries in ucd_script_sets[] */
|
||||
|
||||
#define ucd_script_sets_item_size 3
|
||||
|
||||
#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */
|
||||
|
||||
/* End of pcre2_ucp.h */
|
1524
third_party/pcre/pcre2_ucptables.inc
vendored
Normal file
1524
third_party/pcre/pcre2_ucptables.inc
vendored
Normal file
File diff suppressed because it is too large
Load diff
398
third_party/pcre/pcre2_valid_utf.c
vendored
Normal file
398
third_party/pcre/pcre2_valid_utf.c
vendored
Normal file
|
@ -0,0 +1,398 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2020 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains an internal function for validating UTF character
|
||||
strings. This file is also #included by the pcre2test program, which uses
|
||||
macros to change names from _pcre2_xxx to xxxx, thereby avoiding name clashes
|
||||
with the library. In this case, PCRE2_PCRE2TEST is defined. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST /* We're compiling the library */
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#include "pcre2_internal.h"
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
/*************************************************
|
||||
* Dummy function when Unicode is not supported *
|
||||
*************************************************/
|
||||
|
||||
/* This function should never be called when Unicode is not supported. */
|
||||
|
||||
int
|
||||
PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
|
||||
{
|
||||
(void)string;
|
||||
(void)length;
|
||||
(void)erroroffset;
|
||||
return 0;
|
||||
}
|
||||
#else /* UTF is supported */
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Validate a UTF string *
|
||||
*************************************************/
|
||||
|
||||
/* This function is called (optionally) at the start of compile or match, to
|
||||
check that a supposed UTF string is actually valid. The early check means
|
||||
that subsequent code can assume it is dealing with a valid string. The check
|
||||
can be turned off for maximum performance, but the consequences of supplying an
|
||||
invalid string are then undefined.
|
||||
|
||||
Arguments:
|
||||
string points to the string
|
||||
length length of string
|
||||
errp pointer to an error position offset variable
|
||||
|
||||
Returns: == 0 if the string is a valid UTF string
|
||||
!= 0 otherwise, setting the offset of the bad character
|
||||
*/
|
||||
|
||||
int
|
||||
PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
|
||||
{
|
||||
PCRE2_SPTR p;
|
||||
uint32_t c;
|
||||
|
||||
/* ----------------- Check a UTF-8 string ----------------- */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
|
||||
/* Originally, this function checked according to RFC 2279, allowing for values
|
||||
in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were
|
||||
in the canonical format. Once somebody had pointed out RFC 3629 to me (it
|
||||
obsoletes 2279), additional restrictions were applied. The values are now
|
||||
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
|
||||
subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
|
||||
characters is still checked. Error returns are as follows:
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string
|
||||
PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string
|
||||
PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string
|
||||
PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string
|
||||
PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string
|
||||
PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80
|
||||
PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80
|
||||
PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80
|
||||
PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80
|
||||
PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80
|
||||
PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629
|
||||
PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629
|
||||
PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted
|
||||
PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted
|
||||
PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence
|
||||
PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence
|
||||
PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence
|
||||
PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur)
|
||||
PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur)
|
||||
PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character)
|
||||
PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
|
||||
*/
|
||||
|
||||
for (p = string; length > 0; p++)
|
||||
{
|
||||
uint32_t ab, d;
|
||||
|
||||
c = *p;
|
||||
length--;
|
||||
|
||||
if (c < 128) continue; /* ASCII character */
|
||||
|
||||
if (c < 0xc0) /* Isolated 10xx xxxx byte */
|
||||
{
|
||||
*erroroffset = (PCRE2_SIZE)(p - string);
|
||||
return PCRE2_ERROR_UTF8_ERR20;
|
||||
}
|
||||
|
||||
if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
|
||||
{
|
||||
*erroroffset = (PCRE2_SIZE)(p - string);
|
||||
return PCRE2_ERROR_UTF8_ERR21;
|
||||
}
|
||||
|
||||
ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */
|
||||
if (length < ab) /* Missing bytes */
|
||||
{
|
||||
*erroroffset = (PCRE2_SIZE)(p - string);
|
||||
switch(ab - length)
|
||||
{
|
||||
case 1: return PCRE2_ERROR_UTF8_ERR1;
|
||||
case 2: return PCRE2_ERROR_UTF8_ERR2;
|
||||
case 3: return PCRE2_ERROR_UTF8_ERR3;
|
||||
case 4: return PCRE2_ERROR_UTF8_ERR4;
|
||||
case 5: return PCRE2_ERROR_UTF8_ERR5;
|
||||
}
|
||||
}
|
||||
length -= ab; /* Length remaining */
|
||||
|
||||
/* Check top bits in the second byte */
|
||||
|
||||
if (((d = *(++p)) & 0xc0) != 0x80)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 1;
|
||||
return PCRE2_ERROR_UTF8_ERR6;
|
||||
}
|
||||
|
||||
/* For each length, check that the remaining bytes start with the 0x80 bit
|
||||
set and not the 0x40 bit. Then check for an overlong sequence, and for the
|
||||
excluded range 0xd800 to 0xdfff. */
|
||||
|
||||
switch (ab)
|
||||
{
|
||||
/* 2-byte character. No further bytes to check for 0x80. Check first byte
|
||||
for for xx00 000x (overlong sequence). */
|
||||
|
||||
case 1: if ((c & 0x3e) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 1;
|
||||
return PCRE2_ERROR_UTF8_ERR15;
|
||||
}
|
||||
break;
|
||||
|
||||
/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
|
||||
for 1110 0000, xx0x xxxx (overlong sequence) or
|
||||
1110 1101, 1010 xxxx (0xd800 - 0xdfff) */
|
||||
|
||||
case 2:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if (c == 0xe0 && (d & 0x20) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR16;
|
||||
}
|
||||
if (c == 0xed && d >= 0xa0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR14;
|
||||
}
|
||||
break;
|
||||
|
||||
/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
|
||||
bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a
|
||||
character greater than 0x0010ffff (f4 8f bf bf) */
|
||||
|
||||
case 3:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR8;
|
||||
}
|
||||
if (c == 0xf0 && (d & 0x30) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR17;
|
||||
}
|
||||
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR13;
|
||||
}
|
||||
break;
|
||||
|
||||
/* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
|
||||
rejected by the length test below. However, we do the appropriate tests
|
||||
here so that overlong sequences get diagnosed, and also in case there is
|
||||
ever an option for handling these larger code points. */
|
||||
|
||||
/* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
|
||||
1111 1000, xx00 0xxx */
|
||||
|
||||
case 4:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR8;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE2_ERROR_UTF8_ERR9;
|
||||
}
|
||||
if (c == 0xf8 && (d & 0x38) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE2_ERROR_UTF8_ERR18;
|
||||
}
|
||||
break;
|
||||
|
||||
/* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
|
||||
1111 1100, xx00 00xx. */
|
||||
|
||||
case 5:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR8;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE2_ERROR_UTF8_ERR9;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 5;
|
||||
return PCRE2_ERROR_UTF8_ERR10;
|
||||
}
|
||||
if (c == 0xfc && (d & 0x3c) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 5;
|
||||
return PCRE2_ERROR_UTF8_ERR19;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/* Character is valid under RFC 2279, but 4-byte and 5-byte characters are
|
||||
excluded by RFC 3629. The pointer p is currently at the last byte of the
|
||||
character. */
|
||||
|
||||
if (ab > 3)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - ab;
|
||||
return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
|
||||
/* ----------------- Check a UTF-16 string ----------------- */
|
||||
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
|
||||
/* There's not so much work, nor so many errors, for UTF-16.
|
||||
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string
|
||||
PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate
|
||||
PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate
|
||||
*/
|
||||
|
||||
for (p = string; length > 0; p++)
|
||||
{
|
||||
c = *p;
|
||||
length--;
|
||||
|
||||
if ((c & 0xf800) != 0xd800)
|
||||
{
|
||||
/* Normal UTF-16 code point. Neither high nor low surrogate. */
|
||||
}
|
||||
else if ((c & 0x0400) == 0)
|
||||
{
|
||||
/* High surrogate. Must be a followed by a low surrogate. */
|
||||
if (length == 0)
|
||||
{
|
||||
*erroroffset = p - string;
|
||||
return PCRE2_ERROR_UTF16_ERR1;
|
||||
}
|
||||
p++;
|
||||
length--;
|
||||
if ((*p & 0xfc00) != 0xdc00)
|
||||
{
|
||||
*erroroffset = p - string - 1;
|
||||
return PCRE2_ERROR_UTF16_ERR2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Isolated low surrogate. Always an error. */
|
||||
*erroroffset = p - string;
|
||||
return PCRE2_ERROR_UTF16_ERR3;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
|
||||
|
||||
/* ----------------- Check a UTF-32 string ----------------- */
|
||||
|
||||
#else
|
||||
|
||||
/* There is very little to do for a UTF-32 string.
|
||||
PCRE2_ERROR_UTF32_ERR1 Surrogate character
|
||||
PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff
|
||||
*/
|
||||
|
||||
for (p = string; length > 0; length--, p++)
|
||||
{
|
||||
c = *p;
|
||||
if ((c & 0xfffff800u) != 0xd800u)
|
||||
{
|
||||
/* Normal UTF-32 code point. Neither high nor low surrogate. */
|
||||
if (c > 0x10ffffu)
|
||||
{
|
||||
*erroroffset = p - string;
|
||||
return PCRE2_ERROR_UTF32_ERR2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* A surrogate */
|
||||
*erroroffset = p - string;
|
||||
return PCRE2_ERROR_UTF32_ERR1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
#endif /* CODE_UNIT_WIDTH */
|
||||
}
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* End of pcre2_valid_utf.c */
|
398
third_party/pcre/pcre2_valid_utf.inc
vendored
Normal file
398
third_party/pcre/pcre2_valid_utf.inc
vendored
Normal file
|
@ -0,0 +1,398 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2020 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains an internal function for validating UTF character
|
||||
strings. This file is also #included by the pcre2test program, which uses
|
||||
macros to change names from _pcre2_xxx to xxxx, thereby avoiding name clashes
|
||||
with the library. In this case, PCRE2_PCRE2TEST is defined. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST /* We're compiling the library */
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#include "pcre2_internal.h"
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
||||
|
||||
#ifndef SUPPORT_UNICODE
|
||||
/*************************************************
|
||||
* Dummy function when Unicode is not supported *
|
||||
*************************************************/
|
||||
|
||||
/* This function should never be called when Unicode is not supported. */
|
||||
|
||||
int
|
||||
PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
|
||||
{
|
||||
(void)string;
|
||||
(void)length;
|
||||
(void)erroroffset;
|
||||
return 0;
|
||||
}
|
||||
#else /* UTF is supported */
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Validate a UTF string *
|
||||
*************************************************/
|
||||
|
||||
/* This function is called (optionally) at the start of compile or match, to
|
||||
check that a supposed UTF string is actually valid. The early check means
|
||||
that subsequent code can assume it is dealing with a valid string. The check
|
||||
can be turned off for maximum performance, but the consequences of supplying an
|
||||
invalid string are then undefined.
|
||||
|
||||
Arguments:
|
||||
string points to the string
|
||||
length length of string
|
||||
errp pointer to an error position offset variable
|
||||
|
||||
Returns: == 0 if the string is a valid UTF string
|
||||
!= 0 otherwise, setting the offset of the bad character
|
||||
*/
|
||||
|
||||
int
|
||||
PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
|
||||
{
|
||||
PCRE2_SPTR p;
|
||||
uint32_t c;
|
||||
|
||||
/* ----------------- Check a UTF-8 string ----------------- */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
|
||||
/* Originally, this function checked according to RFC 2279, allowing for values
|
||||
in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were
|
||||
in the canonical format. Once somebody had pointed out RFC 3629 to me (it
|
||||
obsoletes 2279), additional restrictions were applied. The values are now
|
||||
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
|
||||
subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
|
||||
characters is still checked. Error returns are as follows:
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string
|
||||
PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string
|
||||
PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string
|
||||
PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string
|
||||
PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string
|
||||
PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80
|
||||
PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80
|
||||
PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80
|
||||
PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80
|
||||
PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80
|
||||
PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629
|
||||
PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629
|
||||
PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted
|
||||
PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted
|
||||
PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence
|
||||
PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence
|
||||
PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence
|
||||
PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur)
|
||||
PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur)
|
||||
PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character)
|
||||
PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
|
||||
*/
|
||||
|
||||
for (p = string; length > 0; p++)
|
||||
{
|
||||
uint32_t ab, d;
|
||||
|
||||
c = *p;
|
||||
length--;
|
||||
|
||||
if (c < 128) continue; /* ASCII character */
|
||||
|
||||
if (c < 0xc0) /* Isolated 10xx xxxx byte */
|
||||
{
|
||||
*erroroffset = (PCRE2_SIZE)(p - string);
|
||||
return PCRE2_ERROR_UTF8_ERR20;
|
||||
}
|
||||
|
||||
if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
|
||||
{
|
||||
*erroroffset = (PCRE2_SIZE)(p - string);
|
||||
return PCRE2_ERROR_UTF8_ERR21;
|
||||
}
|
||||
|
||||
ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */
|
||||
if (length < ab) /* Missing bytes */
|
||||
{
|
||||
*erroroffset = (PCRE2_SIZE)(p - string);
|
||||
switch(ab - length)
|
||||
{
|
||||
case 1: return PCRE2_ERROR_UTF8_ERR1;
|
||||
case 2: return PCRE2_ERROR_UTF8_ERR2;
|
||||
case 3: return PCRE2_ERROR_UTF8_ERR3;
|
||||
case 4: return PCRE2_ERROR_UTF8_ERR4;
|
||||
case 5: return PCRE2_ERROR_UTF8_ERR5;
|
||||
}
|
||||
}
|
||||
length -= ab; /* Length remaining */
|
||||
|
||||
/* Check top bits in the second byte */
|
||||
|
||||
if (((d = *(++p)) & 0xc0) != 0x80)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 1;
|
||||
return PCRE2_ERROR_UTF8_ERR6;
|
||||
}
|
||||
|
||||
/* For each length, check that the remaining bytes start with the 0x80 bit
|
||||
set and not the 0x40 bit. Then check for an overlong sequence, and for the
|
||||
excluded range 0xd800 to 0xdfff. */
|
||||
|
||||
switch (ab)
|
||||
{
|
||||
/* 2-byte character. No further bytes to check for 0x80. Check first byte
|
||||
for for xx00 000x (overlong sequence). */
|
||||
|
||||
case 1: if ((c & 0x3e) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 1;
|
||||
return PCRE2_ERROR_UTF8_ERR15;
|
||||
}
|
||||
break;
|
||||
|
||||
/* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
|
||||
for 1110 0000, xx0x xxxx (overlong sequence) or
|
||||
1110 1101, 1010 xxxx (0xd800 - 0xdfff) */
|
||||
|
||||
case 2:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if (c == 0xe0 && (d & 0x20) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR16;
|
||||
}
|
||||
if (c == 0xed && d >= 0xa0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR14;
|
||||
}
|
||||
break;
|
||||
|
||||
/* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
|
||||
bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a
|
||||
character greater than 0x0010ffff (f4 8f bf bf) */
|
||||
|
||||
case 3:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR8;
|
||||
}
|
||||
if (c == 0xf0 && (d & 0x30) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR17;
|
||||
}
|
||||
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR13;
|
||||
}
|
||||
break;
|
||||
|
||||
/* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
|
||||
rejected by the length test below. However, we do the appropriate tests
|
||||
here so that overlong sequences get diagnosed, and also in case there is
|
||||
ever an option for handling these larger code points. */
|
||||
|
||||
/* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
|
||||
1111 1000, xx00 0xxx */
|
||||
|
||||
case 4:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR8;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE2_ERROR_UTF8_ERR9;
|
||||
}
|
||||
if (c == 0xf8 && (d & 0x38) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE2_ERROR_UTF8_ERR18;
|
||||
}
|
||||
break;
|
||||
|
||||
/* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
|
||||
1111 1100, xx00 00xx. */
|
||||
|
||||
case 5:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR8;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
return PCRE2_ERROR_UTF8_ERR9;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 5;
|
||||
return PCRE2_ERROR_UTF8_ERR10;
|
||||
}
|
||||
if (c == 0xfc && (d & 0x3c) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 5;
|
||||
return PCRE2_ERROR_UTF8_ERR19;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/* Character is valid under RFC 2279, but 4-byte and 5-byte characters are
|
||||
excluded by RFC 3629. The pointer p is currently at the last byte of the
|
||||
character. */
|
||||
|
||||
if (ab > 3)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - ab;
|
||||
return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
|
||||
/* ----------------- Check a UTF-16 string ----------------- */
|
||||
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
|
||||
/* There's not so much work, nor so many errors, for UTF-16.
|
||||
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string
|
||||
PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate
|
||||
PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate
|
||||
*/
|
||||
|
||||
for (p = string; length > 0; p++)
|
||||
{
|
||||
c = *p;
|
||||
length--;
|
||||
|
||||
if ((c & 0xf800) != 0xd800)
|
||||
{
|
||||
/* Normal UTF-16 code point. Neither high nor low surrogate. */
|
||||
}
|
||||
else if ((c & 0x0400) == 0)
|
||||
{
|
||||
/* High surrogate. Must be a followed by a low surrogate. */
|
||||
if (length == 0)
|
||||
{
|
||||
*erroroffset = p - string;
|
||||
return PCRE2_ERROR_UTF16_ERR1;
|
||||
}
|
||||
p++;
|
||||
length--;
|
||||
if ((*p & 0xfc00) != 0xdc00)
|
||||
{
|
||||
*erroroffset = p - string - 1;
|
||||
return PCRE2_ERROR_UTF16_ERR2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Isolated low surrogate. Always an error. */
|
||||
*erroroffset = p - string;
|
||||
return PCRE2_ERROR_UTF16_ERR3;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
|
||||
|
||||
/* ----------------- Check a UTF-32 string ----------------- */
|
||||
|
||||
#else
|
||||
|
||||
/* There is very little to do for a UTF-32 string.
|
||||
PCRE2_ERROR_UTF32_ERR1 Surrogate character
|
||||
PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff
|
||||
*/
|
||||
|
||||
for (p = string; length > 0; length--, p++)
|
||||
{
|
||||
c = *p;
|
||||
if ((c & 0xfffff800u) != 0xd800u)
|
||||
{
|
||||
/* Normal UTF-32 code point. Neither high nor low surrogate. */
|
||||
if (c > 0x10ffffu)
|
||||
{
|
||||
*erroroffset = p - string;
|
||||
return PCRE2_ERROR_UTF32_ERR2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* A surrogate */
|
||||
*erroroffset = p - string;
|
||||
return PCRE2_ERROR_UTF32_ERR1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
#endif /* CODE_UNIT_WIDTH */
|
||||
}
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
/* End of pcre2_valid_utf.c */
|
289
third_party/pcre/pcre2_xclass.c
vendored
Normal file
289
third_party/pcre/pcre2_xclass.c
vendored
Normal file
|
@ -0,0 +1,289 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains an internal function that is used to match an extended
|
||||
class. It is used by pcre2_auto_possessify() and by both pcre2_match() and
|
||||
pcre2_def_match(). */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
/*************************************************
|
||||
* Match character against an XCLASS *
|
||||
*************************************************/
|
||||
|
||||
/* This function is called to match a character against an extended class that
|
||||
might contain codepoints above 255 and/or Unicode properties.
|
||||
|
||||
Arguments:
|
||||
c the character
|
||||
data points to the flag code unit of the XCLASS data
|
||||
utf TRUE if in UTF mode
|
||||
|
||||
Returns: TRUE if character matches, else FALSE
|
||||
*/
|
||||
|
||||
BOOL
|
||||
PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf)
|
||||
{
|
||||
PCRE2_UCHAR t;
|
||||
BOOL negated = (*data & XCL_NOT) != 0;
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
|
||||
utf = TRUE;
|
||||
#endif
|
||||
|
||||
/* Code points < 256 are matched against a bitmap, if one is present. If not,
|
||||
we still carry on, because there may be ranges that start below 256 in the
|
||||
additional data. */
|
||||
|
||||
if (c < 256)
|
||||
{
|
||||
if ((*data & XCL_HASPROP) == 0)
|
||||
{
|
||||
if ((*data & XCL_MAP) == 0) return negated;
|
||||
return (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0;
|
||||
}
|
||||
if ((*data & XCL_MAP) != 0 &&
|
||||
(((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0)
|
||||
return !negated; /* char found */
|
||||
}
|
||||
|
||||
/* First skip the bit map if present. Then match against the list of Unicode
|
||||
properties or large chars or ranges that end with a large char. We won't ever
|
||||
encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */
|
||||
|
||||
if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(PCRE2_UCHAR);
|
||||
|
||||
while ((t = *data++) != XCL_END)
|
||||
{
|
||||
uint32_t x, y;
|
||||
if (t == XCL_SINGLE)
|
||||
{
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf)
|
||||
{
|
||||
GETCHARINC(x, data); /* macro generates multiple statements */
|
||||
}
|
||||
else
|
||||
#endif
|
||||
x = *data++;
|
||||
if (c == x) return !negated;
|
||||
}
|
||||
else if (t == XCL_RANGE)
|
||||
{
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf)
|
||||
{
|
||||
GETCHARINC(x, data); /* macro generates multiple statements */
|
||||
GETCHARINC(y, data); /* macro generates multiple statements */
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
x = *data++;
|
||||
y = *data++;
|
||||
}
|
||||
if (c >= x && c <= y) return !negated;
|
||||
}
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
else /* XCL_PROP & XCL_NOTPROP */
|
||||
{
|
||||
const ucd_record *prop = GET_UCD(c);
|
||||
BOOL isprop = t == XCL_PROP;
|
||||
BOOL ok;
|
||||
|
||||
switch(*data)
|
||||
{
|
||||
case PT_ANY:
|
||||
if (isprop) return !negated;
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
||||
prop->chartype == ucp_Lt) == isprop) return !negated;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
case PT_PC:
|
||||
if ((data[1] == prop->chartype) == isprop) return !negated;
|
||||
break;
|
||||
|
||||
case PT_SC:
|
||||
if ((data[1] == prop->script) == isprop) return !negated;
|
||||
break;
|
||||
|
||||
case PT_SCX:
|
||||
ok = (data[1] == prop->script ||
|
||||
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), data[1]) != 0);
|
||||
if (ok == isprop) return !negated;
|
||||
break;
|
||||
|
||||
case PT_ALNUM:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
|
||||
which means that Perl space and POSIX space are now identical. PCRE
|
||||
was changed at release 8.34. */
|
||||
|
||||
case PT_SPACE: /* Perl space */
|
||||
case PT_PXSPACE: /* POSIX space */
|
||||
switch(c)
|
||||
{
|
||||
HSPACE_CASES:
|
||||
VSPACE_CASES:
|
||||
if (isprop) return !negated;
|
||||
break;
|
||||
|
||||
default:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
|
||||
== isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
case PT_UCNC:
|
||||
if (c < 0xa0)
|
||||
{
|
||||
if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
|
||||
c == CHAR_GRAVE_ACCENT) == isprop)
|
||||
return !negated;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((c < 0xd800 || c > 0xdfff) == isprop)
|
||||
return !negated;
|
||||
}
|
||||
break;
|
||||
|
||||
case PT_BIDICL:
|
||||
if ((UCD_BIDICLASS_PROP(prop) == data[1]) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
case PT_BOOL:
|
||||
ok = MAPBIT(PRIV(ucd_boolprop_sets) +
|
||||
UCD_BPROPS_PROP(prop), data[1]) != 0;
|
||||
if (ok == isprop) return !negated;
|
||||
break;
|
||||
|
||||
/* The following three properties can occur only in an XCLASS, as there
|
||||
is no \p or \P coding for them. */
|
||||
|
||||
/* Graphic character. Implement this as not Z (space or separator) and
|
||||
not C (other), except for Cf (format) with a few exceptions. This seems
|
||||
to be what Perl does. The exceptional characters are:
|
||||
|
||||
U+061C Arabic Letter Mark
|
||||
U+180E Mongolian Vowel Separator
|
||||
U+2066 - U+2069 Various "isolate"s
|
||||
*/
|
||||
|
||||
case PT_PXGRAPH:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
|
||||
(PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
|
||||
(prop->chartype == ucp_Cf &&
|
||||
c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
|
||||
)) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
/* Printable character: same as graphic, with the addition of Zs, i.e.
|
||||
not Zl and not Zp, and U+180E. */
|
||||
|
||||
case PT_PXPRINT:
|
||||
if ((prop->chartype != ucp_Zl &&
|
||||
prop->chartype != ucp_Zp &&
|
||||
(PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
|
||||
(prop->chartype == ucp_Cf &&
|
||||
c != 0x061c && (c < 0x2066 || c > 0x2069))
|
||||
)) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
/* Punctuation: all Unicode punctuation, plus ASCII characters that
|
||||
Unicode treats as symbols rather than punctuation, for Perl
|
||||
compatibility (these are $+<=>^`|~). */
|
||||
|
||||
case PT_PXPUNCT:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
|
||||
(c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
/* This should never occur, but compilers may mutter if there is no
|
||||
default. */
|
||||
|
||||
default:
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
data += 2;
|
||||
}
|
||||
#else
|
||||
(void)utf; /* Avoid compiler warning */
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
}
|
||||
|
||||
return negated; /* char did not match */
|
||||
}
|
||||
|
||||
/* End of pcre2_xclass.c */
|
975
third_party/pcre/pcre2grep.1
vendored
Normal file
975
third_party/pcre/pcre2grep.1
vendored
Normal file
|
@ -0,0 +1,975 @@
|
|||
.TH PCRE2GREP 1 "21 November 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
.B pcre2grep [options] [long options] [pattern] [path1 path2 ...]
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
\fBpcre2grep\fP searches files for character patterns, in the same way as other
|
||||
grep commands do, but it uses the PCRE2 regular expression library to support
|
||||
patterns that are compatible with the regular expressions of Perl 5. See
|
||||
.\" HREF
|
||||
\fBpcre2syntax\fP(3)
|
||||
.\"
|
||||
for a quick-reference summary of pattern syntax, or
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP(3)
|
||||
.\"
|
||||
for a full description of the syntax and semantics of the regular expressions
|
||||
that PCRE2 supports.
|
||||
.P
|
||||
Patterns, whether supplied on the command line or in a separate file, are given
|
||||
without delimiters. For example:
|
||||
.sp
|
||||
pcre2grep Thursday /etc/motd
|
||||
.sp
|
||||
If you attempt to use delimiters (for example, by surrounding a pattern with
|
||||
slashes, as is common in Perl scripts), they are interpreted as part of the
|
||||
pattern. Quotes can of course be used to delimit patterns on the command line
|
||||
because they are interpreted by the shell, and indeed quotes are required if a
|
||||
pattern contains white space or shell metacharacters.
|
||||
.P
|
||||
The first argument that follows any option settings is treated as the single
|
||||
pattern to be matched when neither \fB-e\fP nor \fB-f\fP is present.
|
||||
Conversely, when one or both of these options are used to specify patterns, all
|
||||
arguments are treated as path names. At least one of \fB-e\fP, \fB-f\fP, or an
|
||||
argument pattern must be provided.
|
||||
.P
|
||||
If no files are specified, \fBpcre2grep\fP reads the standard input. The
|
||||
standard input can also be referenced by a name consisting of a single hyphen.
|
||||
For example:
|
||||
.sp
|
||||
pcre2grep some-pattern file1 - file3
|
||||
.sp
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. For
|
||||
example, the \fB-M\fP option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the \fB-N\fP
|
||||
(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
|
||||
not file names are shown, and the \fB-Z\fP option changes the file name
|
||||
terminator to a zero byte.
|
||||
.P
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
||||
\fB--max-buffer-size\fP options. The first of these sets the size of buffer
|
||||
that is obtained at the start of processing. If an input file contains very
|
||||
long lines, a larger buffer may be needed; this is handled by automatically
|
||||
extending the buffer, up to the limit specified by \fB--max-buffer-size\fP. The
|
||||
default values for these parameters can be set when \fBpcre2grep\fP is
|
||||
built; if nothing is specified, the defaults are set to 20KiB and 1MiB
|
||||
respectively. An error occurs if a line is too long and the buffer can no
|
||||
longer be expanded.
|
||||
.P
|
||||
The block of memory that is actually used is three times the "buffer size", to
|
||||
allow for buffering "before" and "after" lines. If the buffer size is too
|
||||
small, fewer than requested "before" and "after" lines may be output.
|
||||
.P
|
||||
Patterns can be no longer than 8KiB or BUFSIZ bytes, whichever is the greater.
|
||||
BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern
|
||||
(specified by the use of \fB-e\fP and/or \fB-f\fP), each pattern is applied to
|
||||
each line in the order in which they are defined, except that all the \fB-e\fP
|
||||
patterns are tried before the \fB-f\fP patterns.
|
||||
.P
|
||||
By default, as soon as one pattern matches a line, no further patterns are
|
||||
considered. However, if \fB--colour\fP (or \fB--color\fP) is used to colour the
|
||||
matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP,
|
||||
\fB--line-offsets\fP, or \fB--output\fP is used to output only the part of the
|
||||
line that matched (either shown literally, or as an offset), the behaviour is
|
||||
different. In this situation, all the patterns are applied to the line. If
|
||||
there is more than one match, the one that begins nearest to the start of the
|
||||
subject is processed; if there is more than one match at that position, the one
|
||||
with the longest matching substring is processed; if the matching substrings
|
||||
are equal, the first match found is processed.
|
||||
.P
|
||||
Scanning with all the patterns resumes immediately following the match, so that
|
||||
later matches on the same line can be found. Note, however, that an overlapping
|
||||
match that starts in the middle of another match will not be processed.
|
||||
.P
|
||||
The above behaviour was changed at release 10.41 to be more compatible with GNU
|
||||
grep. In earlier releases, \fBpcre2grep\fP did not recognize matches from
|
||||
later patterns that were earlier in the subject.
|
||||
.P
|
||||
Patterns that can match an empty string are accepted, but empty string
|
||||
matches are never recognized. An example is the pattern "(super)?(man)?", in
|
||||
which all components are optional. This pattern finds all occurrences of both
|
||||
"super" and "man"; the output differs from matching with "super|man" when only
|
||||
the matching substrings are being shown.
|
||||
.P
|
||||
If the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variable is set,
|
||||
\fBpcre2grep\fP uses the value to set a locale when calling the PCRE2 library.
|
||||
The \fB--locale\fP option can be used to override this.
|
||||
.
|
||||
.
|
||||
.SH "SUPPORT FOR COMPRESSED FILES"
|
||||
.rs
|
||||
.sp
|
||||
Compile-time options for \fBpcre2grep\fP can set it up to use \fBlibz\fP or
|
||||
\fBlibbz2\fP for reading compressed files whose names end in \fB.gz\fP or
|
||||
\fB.bz2\fP, respectively. You can find out whether your \fBpcre2grep\fP binary
|
||||
has support for one or both of these file types by running it with the
|
||||
\fB--help\fP option. If the appropriate support is not present, all files are
|
||||
treated as plain text. The standard input is always so treated. If a file with
|
||||
a \fB.gz\fP or \fB.bz2\fP extension is not in fact compressed, it is read as a
|
||||
plain text file. When input is from a compressed .gz or .bz2 file, the
|
||||
\fB--line-buffered\fP option is ignored.
|
||||
.
|
||||
.
|
||||
.SH "BINARY FILES"
|
||||
.rs
|
||||
.sp
|
||||
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||
is identified as a binary file, and is processed specially. However, if the
|
||||
newline type is specified as NUL, that is, the line terminator is a binary
|
||||
zero, the test for a binary file is not applied. See the \fB--binary-files\fP
|
||||
option for a means of changing the way binary files are handled.
|
||||
.
|
||||
.
|
||||
.SH "BINARY ZEROS IN PATTERNS"
|
||||
.rs
|
||||
.sp
|
||||
Patterns passed from the command line are strings that are terminated by a
|
||||
binary zero, so cannot contain internal zeros. However, patterns that are read
|
||||
from a file via the \fB-f\fP option may contain binary zeros.
|
||||
.
|
||||
.
|
||||
.SH OPTIONS
|
||||
.rs
|
||||
.sp
|
||||
The order in which some of the options appear can affect the output. For
|
||||
example, both the \fB-H\fP and \fB-l\fP options affect the printing of file
|
||||
names. Whichever comes later in the command line will be the one that takes
|
||||
effect. Similarly, except where noted below, if an option is given twice, the
|
||||
later setting is used. Numerical values for options may be followed by K or M,
|
||||
to signify multiplication by 1024 or 1024*1024 respectively.
|
||||
.TP 10
|
||||
\fB--\fP
|
||||
This terminates the list of options. It is useful if the next item on the
|
||||
command line starts with a hyphen but is not an option. This allows for the
|
||||
processing of patterns and file names that start with hyphens.
|
||||
.TP
|
||||
\fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP
|
||||
Output up to \fInumber\fP lines of context after each matching line. Fewer
|
||||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines (the \fB-Z\fP option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
|
||||
\fB-A\fP is ignored.
|
||||
.TP
|
||||
\fB-a\fP, \fB--text\fP
|
||||
Treat binary files as text. This is equivalent to
|
||||
\fB--binary-files\fP=\fItext\fP.
|
||||
.TP
|
||||
\fB--allow-lookaround-bsk\fP
|
||||
PCRE2 now forbids the use of \eK in lookarounds by default, in line with Perl.
|
||||
This option causes \fBpcre2grep\fP to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option, which enables this somewhat dangerous usage.
|
||||
.TP
|
||||
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
|
||||
Output up to \fInumber\fP lines of context before each matching line. Fewer
|
||||
lines are output if the previous match or the start of the file is within
|
||||
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines (the \fB-Z\fP option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
\fB-c\fP is used, \fB-B\fP is ignored.
|
||||
.TP
|
||||
\fB--binary-files=\fP\fIword\fP
|
||||
Specify how binary files are to be processed. If the word is "binary" (the
|
||||
default), pattern matching is performed on binary files, but the only output is
|
||||
"Binary file <name> matches" when a match succeeds. If the word is "text",
|
||||
which is equivalent to the \fB-a\fP or \fB--text\fP option, binary files are
|
||||
processed in the same way as any other file. In this case, when a match
|
||||
succeeds, the output may be binary garbage, which can have nasty effects if
|
||||
sent to a terminal. If the word is "without-match", which is equivalent to the
|
||||
\fB-I\fP option, binary files are not processed at all; they are assumed not to
|
||||
be of interest and are skipped without causing any output or affecting the
|
||||
return code.
|
||||
.TP
|
||||
\fB--buffer-size=\fP\fInumber\fP
|
||||
Set the parameter that controls how much memory is obtained at the start of
|
||||
processing for buffering files that are being scanned. See also
|
||||
\fB--max-buffer-size\fP below.
|
||||
.TP
|
||||
\fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP
|
||||
Output \fInumber\fP lines of context both before and after each matching line.
|
||||
This is equivalent to setting both \fB-A\fP and \fB-B\fP to the same value.
|
||||
.TP
|
||||
\fB-c\fP, \fB--count\fP
|
||||
Do not output lines from the files that are being scanned; instead output the
|
||||
number of lines that would have been shown, either because they matched, or, if
|
||||
\fB-v\fP is set, because they failed to match. By default, this count is
|
||||
exactly the same as the number of lines that would have been output, but if the
|
||||
\fB-M\fP (multiline) option is used (without \fB-v\fP), there may be more
|
||||
suppressed lines than the count (that is, the number of matches).
|
||||
.sp
|
||||
If no lines are selected, the number zero is output. If several files are are
|
||||
being scanned, a count is output for each of them and the \fB-t\fP option can
|
||||
be used to cause a total to be output at the end. However, if the
|
||||
\fB--files-with-matches\fP option is also used, only those files whose counts
|
||||
are greater than zero are listed. When \fB-c\fP is used, the \fB-A\fP,
|
||||
\fB-B\fP, and \fB-C\fP options are ignored.
|
||||
.TP
|
||||
\fB--colour\fP, \fB--color\fP
|
||||
If this option is given without any data, it is equivalent to "--colour=auto".
|
||||
If data is required, it must be given in the same shell item, separated by an
|
||||
equals sign.
|
||||
.TP
|
||||
\fB--colour=\fP\fIvalue\fP, \fB--color=\fP\fIvalue\fP
|
||||
This option specifies under what circumstances the parts of a line that matched
|
||||
a pattern should be coloured in the output. It is ignored if
|
||||
\fB--file-offsets\fP, \fB--line-offsets\fP, or \fB--output\fP is set. By
|
||||
default, output is not coloured. The value for the \fB--colour\fP option (which
|
||||
is optional, see above) may be "never", "always", or "auto". In the latter
|
||||
case, colouring happens only if the standard output is connected to a terminal.
|
||||
More resources are used when colouring is enabled, because \fBpcre2grep\fP has
|
||||
to search for all possible matches in a line, not just one, in order to colour
|
||||
them all.
|
||||
.sp
|
||||
The colour that is used can be specified by setting one of the environment
|
||||
variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, PCREGREP_COLOUR, or
|
||||
PCREGREP_COLOR, which are checked in that order. If none of these are set,
|
||||
\fBpcre2grep\fP looks for GREP_COLORS or GREP_COLOR (in that order). The value
|
||||
of the variable should be a string of two numbers, separated by a semicolon,
|
||||
except in the case of GREP_COLORS, which must start with "ms=" or "mt="
|
||||
followed by two semicolon-separated colours, terminated by the end of the
|
||||
string or by a colon. If GREP_COLORS does not start with "ms=" or "mt=" it is
|
||||
ignored, and GREP_COLOR is checked.
|
||||
.sp
|
||||
If the string obtained from one of the above variables contains any characters
|
||||
other than semicolon or digits, the setting is ignored and the default colour
|
||||
is used. The string is copied directly into the control string for setting
|
||||
colour on a terminal, so it is your responsibility to ensure that the values
|
||||
make sense. If no relevant environment variable is set, the default is "1;31",
|
||||
which gives red.
|
||||
.TP
|
||||
\fB-D\fP \fIaction\fP, \fB--devices=\fP\fIaction\fP
|
||||
If an input path is not a regular file or a directory, "action" specifies how
|
||||
it is to be processed. Valid values are "read" (the default) or "skip"
|
||||
(silently skip the path).
|
||||
.TP
|
||||
\fB-d\fP \fIaction\fP, \fB--directories=\fP\fIaction\fP
|
||||
If an input path is a directory, "action" specifies how it is to be processed.
|
||||
Valid values are "read" (the default in non-Windows environments, for
|
||||
compatibility with GNU grep), "recurse" (equivalent to the \fB-r\fP option), or
|
||||
"skip" (silently skip the path, the default in Windows environments). In the
|
||||
"read" case, directories are read as if they were ordinary files. In some
|
||||
operating systems the effect of reading a directory like this is an immediate
|
||||
end-of-file; in others it may provoke an error.
|
||||
.TP
|
||||
\fB--depth-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP below.
|
||||
.TP
|
||||
\fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP, \fB--regexp=\fP\fIpattern\fP
|
||||
Specify a pattern to be matched. This option can be used multiple times in
|
||||
order to specify several patterns. It can also be used as a way of specifying a
|
||||
single pattern that starts with a hyphen. When \fB-e\fP is used, no argument
|
||||
pattern is taken from the command line; all arguments are treated as file
|
||||
names. There is no limit to the number of patterns. They are applied to each
|
||||
line in the order in which they are defined.
|
||||
.sp
|
||||
If \fB-f\fP is used with \fB-e\fP, the command line patterns are matched first,
|
||||
followed by the patterns from the file(s), independent of the order in which
|
||||
these options are specified.
|
||||
.TP
|
||||
\fB--exclude\fP=\fIpattern\fP
|
||||
Files (but not directories) whose names match the pattern are skipped without
|
||||
being processed. This applies to all files, whether listed on the command line,
|
||||
obtained from \fB--file-list\fP, or by scanning a directory. The pattern is a
|
||||
PCRE2 regular expression, and is matched against the final component of the
|
||||
file name, not the entire path. The \fB-F\fP, \fB-w\fP, and \fB-x\fP options do
|
||||
not apply to this pattern. The option may be given any number of times in order
|
||||
to specify multiple patterns. If a file name matches both an \fB--include\fP
|
||||
and an \fB--exclude\fP pattern, it is excluded. There is no short form for this
|
||||
option.
|
||||
.TP
|
||||
\fB--exclude-from=\fP\fIfilename\fP
|
||||
Treat each non-empty line of the file as the data for an \fB--exclude\fP
|
||||
option. What constitutes a newline when reading the file is the operating
|
||||
system's default. The \fB--newline\fP option has no effect on this option. This
|
||||
option may be given more than once in order to specify a number of files to
|
||||
read.
|
||||
.TP
|
||||
\fB--exclude-dir\fP=\fIpattern\fP
|
||||
Directories whose names match the pattern are skipped without being processed,
|
||||
whatever the setting of the \fB--recursive\fP option. This applies to all
|
||||
directories, whether listed on the command line, obtained from
|
||||
\fB--file-list\fP, or by scanning a parent directory. The pattern is a PCRE2
|
||||
regular expression, and is matched against the final component of the directory
|
||||
name, not the entire path. The \fB-F\fP, \fB-w\fP, and \fB-x\fP options do not
|
||||
apply to this pattern. The option may be given any number of times in order to
|
||||
specify more than one pattern. If a directory matches both \fB--include-dir\fP
|
||||
and \fB--exclude-dir\fP, it is excluded. There is no short form for this
|
||||
option.
|
||||
.TP
|
||||
\fB-F\fP, \fB--fixed-strings\fP
|
||||
Interpret each data-matching pattern as a list of fixed strings, separated by
|
||||
newlines, instead of as a regular expression. What constitutes a newline for
|
||||
this purpose is controlled by the \fB--newline\fP option. The \fB-w\fP (match
|
||||
as a word) and \fB-x\fP (match whole line) options can be used with \fB-F\fP.
|
||||
They apply to each of the fixed strings. A line is selected if any of the fixed
|
||||
strings are found in it (subject to \fB-w\fP or \fB-x\fP, if present). This
|
||||
option applies only to the patterns that are matched against the contents of
|
||||
files; it does not apply to patterns specified by any of the \fB--include\fP or
|
||||
\fB--exclude\fP options.
|
||||
.TP
|
||||
\fB-f\fP \fIfilename\fP, \fB--file=\fP\fIfilename\fP
|
||||
Read patterns from the file, one per line. As is the case with patterns on the
|
||||
command line, no delimiters should be used. What constitutes a newline when
|
||||
reading the file is the operating system's default interpretation of \en. The
|
||||
\fB--newline\fP option has no effect on this option. Trailing white space is
|
||||
removed from each line, and blank lines are ignored. An empty file contains no
|
||||
patterns and therefore matches nothing. Patterns read from a file in this way
|
||||
may contain binary zeros, which are treated as ordinary data characters.
|
||||
.sp
|
||||
If this option is given more than once, all the specified files are read. A
|
||||
data line is output if any of the patterns match it. A file name can be given
|
||||
as "-" to refer to the standard input. When \fB-f\fP is used, patterns
|
||||
specified on the command line using \fB-e\fP may also be present; they are
|
||||
matched before the file's patterns. However, no pattern is taken from the
|
||||
command line; all arguments are treated as the names of paths to be searched.
|
||||
.TP
|
||||
\fB--file-list\fP=\fIfilename\fP
|
||||
Read a list of files and/or directories that are to be scanned from the given
|
||||
file, one per line. What constitutes a newline when reading the file is the
|
||||
operating system's default. Trailing white space is removed from each line, and
|
||||
blank lines are ignored. These paths are processed before any that are listed
|
||||
on the command line. The file name can be given as "-" to refer to the standard
|
||||
input. If \fB--file\fP and \fB--file-list\fP are both specified as "-",
|
||||
patterns are read first. This is useful only when the standard input is a
|
||||
terminal, from which further lines (the list of files) can be read after an
|
||||
end-of-file indication. If this option is given more than once, all the
|
||||
specified files are read.
|
||||
.TP
|
||||
\fB--file-offsets\fP
|
||||
Instead of showing lines or parts of lines that match, show each match as an
|
||||
offset from the start of the file and a length, separated by a comma. In this
|
||||
mode, \fB--colour\fP has no effect, and no context is shown. That is, the
|
||||
\fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is more than one
|
||||
match in a line, each of them is shown separately. This option is mutually
|
||||
exclusive with \fB--output\fP, \fB--line-offsets\fP, and \fB--only-matching\fP.
|
||||
.TP
|
||||
\fB-H\fP, \fB--with-filename\fP
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the \fB-M\fP option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB-h\fP, \fB--no-filename\fP
|
||||
Suppress the output file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The \fB-Z\fP option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB--heap-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP below.
|
||||
.TP
|
||||
\fB--help\fP
|
||||
Output a help message, giving brief details of the command options and file
|
||||
type support, and then exit. Anything else on the command line is
|
||||
ignored.
|
||||
.TP
|
||||
\fB-I\fP
|
||||
Ignore binary files. This is equivalent to
|
||||
\fB--binary-files\fP=\fIwithout-match\fP.
|
||||
.TP
|
||||
\fB-i\fP, \fB--ignore-case\fP
|
||||
Ignore upper/lower case distinctions during comparisons.
|
||||
.TP
|
||||
\fB--include\fP=\fIpattern\fP
|
||||
If any \fB--include\fP patterns are specified, the only files that are
|
||||
processed are those whose names match one of the patterns and do not match an
|
||||
\fB--exclude\fP pattern. This option does not affect directories, but it
|
||||
applies to all files, whether listed on the command line, obtained from
|
||||
\fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular
|
||||
expression, and is matched against the final component of the file name, not
|
||||
the entire path. The \fB-F\fP, \fB-w\fP, and \fB-x\fP options do not apply to
|
||||
this pattern. The option may be given any number of times. If a file name
|
||||
matches both an \fB--include\fP and an \fB--exclude\fP pattern, it is excluded.
|
||||
There is no short form for this option.
|
||||
.TP
|
||||
\fB--include-from=\fP\fIfilename\fP
|
||||
Treat each non-empty line of the file as the data for an \fB--include\fP
|
||||
option. What constitutes a newline for this purpose is the operating system's
|
||||
default. The \fB--newline\fP option has no effect on this option. This option
|
||||
may be given any number of times; all the files are read.
|
||||
.TP
|
||||
\fB--include-dir\fP=\fIpattern\fP
|
||||
If any \fB--include-dir\fP patterns are specified, the only directories that
|
||||
are processed are those whose names match one of the patterns and do not match
|
||||
an \fB--exclude-dir\fP pattern. This applies to all directories, whether listed
|
||||
on the command line, obtained from \fB--file-list\fP, or by scanning a parent
|
||||
directory. The pattern is a PCRE2 regular expression, and is matched against
|
||||
the final component of the directory name, not the entire path. The \fB-F\fP,
|
||||
\fB-w\fP, and \fB-x\fP options do not apply to this pattern. The option may be
|
||||
given any number of times. If a directory matches both \fB--include-dir\fP and
|
||||
\fB--exclude-dir\fP, it is excluded. There is no short form for this option.
|
||||
.TP
|
||||
\fB-L\fP, \fB--files-without-match\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line by default, but if the \fB-Z\fP option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB-l\fP, \fB--files-with-matches\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line, but if the \fB-Z\fP option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the \fB-c\fP (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with \fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB--label\fP=\fIname\fP
|
||||
This option supplies a name to be used for the standard input when file names
|
||||
are being output. If not supplied, "(standard input)" is used. There is no
|
||||
short form for this option.
|
||||
.TP
|
||||
\fB--line-buffered\fP
|
||||
When this option is given, non-compressed input is read and processed line by
|
||||
line, and the output is flushed after each write. By default, input is read in
|
||||
large chunks, unless \fBpcre2grep\fP can determine that it is reading from a
|
||||
terminal, which is currently possible only in Unix-like environments or
|
||||
Windows. Output to terminal is normally automatically flushed by the operating
|
||||
system. This option can be useful when the input or output is attached to a
|
||||
pipe and you do not want \fBpcre2grep\fP to buffer up large amounts of data.
|
||||
However, its use will affect performance, and the \fB-M\fP (multiline) option
|
||||
ceases to work. When input is from a compressed .gz or .bz2 file,
|
||||
\fB--line-buffered\fP is ignored.
|
||||
.TP
|
||||
\fB--line-offsets\fP
|
||||
Instead of showing lines or parts of lines that match, show each match as a
|
||||
line number, the offset from the start of the line, and a length. The line
|
||||
number is terminated by a colon (as usual; see the \fB-n\fP option), and the
|
||||
offset and length are separated by a comma. In this mode, \fB--colour\fP has no
|
||||
effect, and no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP
|
||||
options are ignored. If there is more than one match in a line, each of them is
|
||||
shown separately. This option is mutually exclusive with \fB--output\fP,
|
||||
\fB--file-offsets\fP, and \fB--only-matching\fP.
|
||||
.TP
|
||||
\fB--locale\fP=\fIlocale-name\fP
|
||||
This option specifies a locale to be used for pattern matching. It overrides
|
||||
the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
|
||||
locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
||||
used. There is no short form for this option.
|
||||
.TP
|
||||
\fB-M\fP, \fB--multiline\fP
|
||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||
library is called in "multiline" mode. This allows a matched string to extend
|
||||
past the end of a line and continue on one or more subsequent lines. Patterns
|
||||
used with \fB-M\fP may usefully contain literal newline characters and internal
|
||||
occurrences of ^ and $ characters. The output for a successful match may
|
||||
consist of more than one line. The first line is the line in which the match
|
||||
started, and the last line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the end of that line.
|
||||
If \fB-v\fP is set, none of the lines in a multi-line match are output. Once a
|
||||
match has been handled, scanning restarts at the beginning of the line after
|
||||
the one in which the match ended.
|
||||
.sp
|
||||
The newline sequence that separates multiple lines must be matched as part of
|
||||
the pattern. For example, to find the phrase "regular expression" in a file
|
||||
where "regular" might be at the end of a line and "expression" at the start of
|
||||
the next line, you could use this command:
|
||||
.sp
|
||||
pcre2grep -M 'regular\es+expression' <file>
|
||||
.sp
|
||||
The \es escape sequence matches any white space character, including newlines,
|
||||
and is followed by + so as to match trailing white space on the first line as
|
||||
well as possibly handling a two-character newline sequence.
|
||||
.sp
|
||||
There is a limit to the number of lines that can be matched, imposed by the way
|
||||
that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently
|
||||
large processing buffer, this should not be a problem, but the \fB-M\fP option
|
||||
does not work when input is read line by line (see \fB--line-buffered\fP.)
|
||||
.TP
|
||||
\fB-m\fP \fInumber\fP, \fB--max-count\fP=\fInumber\fP
|
||||
Stop processing after finding \fInumber\fP matching lines, or non-matching
|
||||
lines if \fB-v\fP is also set. Any trailing context lines are output after the
|
||||
final match. In multiline mode, each multiline match counts as just one line
|
||||
for this purpose. If this limit is reached when reading the standard input from
|
||||
a regular file, the file is left positioned just after the last matching line.
|
||||
If \fB-c\fP is also set, the count that is output is never greater than
|
||||
\fInumber\fP. This option has no effect if used with \fB-L\fP, \fB-l\fP, or
|
||||
\fB-q\fP, or when just checking for a match in a binary file.
|
||||
.TP
|
||||
\fB--match-limit\fP=\fInumber\fP
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
memory. There are three options that set resource limits for matching.
|
||||
.sp
|
||||
The \fB--match-limit\fP option provides a means of limiting computing resource
|
||||
usage when processing patterns that are not going to match, but which have a
|
||||
very large number of possibilities in their search trees. The classic example
|
||||
is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main processing loop. If the
|
||||
value set by \fB--match-limit\fP is reached, an error occurs.
|
||||
.sp
|
||||
The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
.sp
|
||||
The \fB--depth-limit\fP option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
needed for each backtracking point depends on the number of capturing
|
||||
parentheses in the pattern, so the amount of memory that is used before this
|
||||
limit acts varies from pattern to pattern. This limit is of use only if it is
|
||||
set smaller than \fB--match-limit\fP.
|
||||
.sp
|
||||
There are no short forms for these options. The default limits can be set
|
||||
when the PCRE2 library is compiled; if they are not specified, the defaults
|
||||
are very large and so effectively unlimited.
|
||||
.TP
|
||||
\fB--max-buffer-size\fP=\fInumber\fP
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
.TP
|
||||
\fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP
|
||||
Six different conventions for indicating the ends of lines in scanned files are
|
||||
supported. For example:
|
||||
.sp
|
||||
pcre2grep -N CRLF 'some pattern' <file>
|
||||
.sp
|
||||
The newline type may be specified in upper, lower, or mixed case. If the
|
||||
newline type is NUL, lines are separated by binary zero characters. The other
|
||||
types are the single-character sequences CR (carriage return) and LF
|
||||
(linefeed), the two-character sequence CRLF, an "anycrlf" type, which
|
||||
recognizes any of the preceding three types, and an "any" type, for which any
|
||||
Unicode line ending sequence is assumed to end a line. The Unicode sequences
|
||||
are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
|
||||
(paragraph separator, U+2029).
|
||||
.sp
|
||||
When the PCRE2 library is built, a default line-ending sequence is specified.
|
||||
This is normally the standard sequence for the operating system. Unless
|
||||
otherwise specified by this option, \fBpcre2grep\fP uses the library's default.
|
||||
.sp
|
||||
This option makes it possible to use \fBpcre2grep\fP to scan files that have
|
||||
come from other environments without having to modify their line endings. If
|
||||
the data that is being scanned does not agree with the convention set by this
|
||||
option, \fBpcre2grep\fP may behave in strange ways. Note that this option does
|
||||
not apply to files specified by the \fB-f\fP, \fB--exclude-from\fP, or
|
||||
\fB--include-from\fP options, which are expected to use the operating system's
|
||||
standard newline sequence.
|
||||
.TP
|
||||
\fB-n\fP, \fB--line-number\fP
|
||||
Precede each output line by its line number in the file, followed by a colon
|
||||
for matching lines or a hyphen for context lines. If the file name is also
|
||||
being output, it precedes the line number. When the \fB-M\fP option causes a
|
||||
pattern to match more than one line, only the first is preceded by its line
|
||||
number. This option is forced if \fB--line-offsets\fP is used.
|
||||
.TP
|
||||
\fB--no-jit\fP
|
||||
If the PCRE2 library is built with support for just-in-time compiling (which
|
||||
speeds up matching), \fBpcre2grep\fP automatically makes use of this, unless it
|
||||
was explicitly disabled at build time. This option can be used to disable the
|
||||
use of JIT at run time. It is provided for testing and working round problems.
|
||||
It should never be needed in normal use.
|
||||
.TP
|
||||
\fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP
|
||||
When there is a match, instead of outputting the line that matched, output just
|
||||
the text specified in this option, followed by an operating-system standard
|
||||
newline. In this mode, \fB--colour\fP has no effect, and no context is shown.
|
||||
That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. The
|
||||
\fB--newline\fP option has no effect on this option, which is mutually
|
||||
exclusive with \fB--only-matching\fP, \fB--file-offsets\fP, and
|
||||
\fB--line-offsets\fP. However, like \fB--only-matching\fP, if there is more
|
||||
than one match in a line, each of them causes a line of output.
|
||||
.sp
|
||||
Escape sequences starting with a dollar character may be used to insert the
|
||||
contents of the matched part of the line and/or captured substrings into the
|
||||
text.
|
||||
.sp
|
||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||
decimal number; zero substitutes the whole match. If the number is greater than
|
||||
the number of capturing substrings, or if the capture is unset, the replacement
|
||||
is empty.
|
||||
.sp
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
||||
.sp
|
||||
$o<digits> or $o{<digits>} is replaced by the character whose code point is the
|
||||
given octal number. In the first form, up to three octal digits are processed.
|
||||
When more digits are needed in Unicode mode to specify a wide character, the
|
||||
second form must be used.
|
||||
.sp
|
||||
$x<digits> or $x{<digits>} is replaced by the character represented by the
|
||||
given hexadecimal number. In the first form, up to two hexadecimal digits are
|
||||
processed. When more digits are needed in Unicode mode to specify a wide
|
||||
character, the second form must be used.
|
||||
.sp
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
a single dollar.
|
||||
.TP
|
||||
\fB-o\fP, \fB--only-matching\fP
|
||||
Show only the part of the line that matched a pattern instead of the whole
|
||||
line. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and
|
||||
\fB-C\fP options are ignored. If there is more than one match in a line, each
|
||||
of them is shown separately, on a separate line of output. If \fB-o\fP is
|
||||
combined with \fB-v\fP (invert the sense of the match to find non-matching
|
||||
lines), no output is generated, but the return code is set appropriately. If
|
||||
the matched portion of the line is empty, nothing is output unless the file
|
||||
name or line number are being printed, in which case they are shown on an
|
||||
otherwise empty line. This option is mutually exclusive with \fB--output\fP,
|
||||
\fB--file-offsets\fP and \fB--line-offsets\fP.
|
||||
.TP
|
||||
\fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP
|
||||
Show only the part of the line that matched the capturing parentheses of the
|
||||
given number. Up to 50 capturing parentheses are supported by default. This
|
||||
limit can be changed via the \fB--om-capture\fP option. A pattern may contain
|
||||
any number of capturing parentheses, but only those whose number is within the
|
||||
limit can be accessed by \fB-o\fP. An error occurs if the number specified by
|
||||
\fB-o\fP is greater than the limit.
|
||||
.sp
|
||||
-o0 is the same as \fB-o\fP without a number. Because these options can be
|
||||
given without an argument (see above), if an argument is present, it must be
|
||||
given in the same shell item, for example, -o3 or --only-matching=2. The
|
||||
comments given for the non-argument case above also apply to this option. If
|
||||
the specified capturing parentheses do not exist in the pattern, or were not
|
||||
set in the match, nothing is output unless the file name or line number are
|
||||
being output.
|
||||
.sp
|
||||
If this option is given multiple times, multiple substrings are output for each
|
||||
match, in the order the options are given, and all on one line. For example,
|
||||
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator (but see the next
|
||||
but one option).
|
||||
.TP
|
||||
\fB--om-capture\fP=\fInumber\fP
|
||||
Set the number of capturing parentheses that can be accessed by \fB-o\fP. The
|
||||
default is 50.
|
||||
.TP
|
||||
\fB--om-separator\fP=\fItext\fP
|
||||
Specify a separating string for multiple occurrences of \fB-o\fP. The default
|
||||
is an empty string. Separating strings are never coloured.
|
||||
.TP
|
||||
\fB-q\fP, \fB--quiet\fP
|
||||
Work quietly, that is, display nothing except error messages. The exit
|
||||
status indicates whether or not any matches were found.
|
||||
.TP
|
||||
\fB-r\fP, \fB--recursive\fP
|
||||
If any given path is a directory, recursively scan the files it contains,
|
||||
taking note of any \fB--include\fP and \fB--exclude\fP settings. By default, a
|
||||
directory is read as a normal file; in some operating systems this gives an
|
||||
immediate end-of-file. This option is a shorthand for setting the \fB-d\fP
|
||||
option to "recurse".
|
||||
.TP
|
||||
\fB--recursion-limit\fP=\fInumber\fP
|
||||
This is an obsolete synonym for \fB--depth-limit\fP. See \fB--match-limit\fP
|
||||
above for details.
|
||||
.TP
|
||||
\fB-s\fP, \fB--no-messages\fP
|
||||
Suppress error messages about non-existent or unreadable files. Such files are
|
||||
quietly skipped. However, the return code is still 2, even if matches were
|
||||
found in other files.
|
||||
.TP
|
||||
\fB-t\fP, \fB--total-count\fP
|
||||
This option is useful when scanning more than one file. If used on its own,
|
||||
\fB-t\fP suppresses all output except for a grand total number of matching
|
||||
lines (or non-matching lines if \fB-v\fP is used) in all the files. If \fB-t\fP
|
||||
is used with \fB-c\fP, a grand total is output except when the previous output
|
||||
is just one line. In other words, it is not output when just one file's count
|
||||
is listed. If file names are being output, the grand total is preceded by
|
||||
"TOTAL:". Otherwise, it appears as just another number. The \fB-t\fP option is
|
||||
ignored when used with \fB-L\fP (list files without matches), because the grand
|
||||
total would always be zero.
|
||||
.TP
|
||||
\fB-u\fP, \fB--utf\fP
|
||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||
with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
|
||||
\fB--include\fP options) and all lines that are scanned must be valid strings
|
||||
of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
|
||||
occurs.
|
||||
.TP
|
||||
\fB-U\fP, \fB--utf-allow-invalid\fP
|
||||
As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
|
||||
unit sequences. These can never form part of any pattern match. Patterns
|
||||
themselves, however, must still be valid UTF-8 strings. This facility allows
|
||||
valid UTF-8 strings to be sought within arbitrary byte sequences in executable
|
||||
or other binary files. For more details about matching in non-valid UTF-8
|
||||
strings, see the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP(3)
|
||||
.\"
|
||||
documentation.
|
||||
.TP
|
||||
\fB-V\fP, \fB--version\fP
|
||||
Write the version numbers of \fBpcre2grep\fP and the PCRE2 library to the
|
||||
standard output and then exit. Anything else on the command line is
|
||||
ignored.
|
||||
.TP
|
||||
\fB-v\fP, \fB--invert-match\fP
|
||||
Invert the sense of the match, so that lines which do \fInot\fP match any of
|
||||
the patterns are the ones that are found. When this option is set, options such
|
||||
as \fB--only-matching\fP and \fB--output\fP, which specify parts of a match
|
||||
that are to be output, are ignored.
|
||||
.TP
|
||||
\fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
|
||||
Force the patterns only to match "words". That is, there must be a word
|
||||
boundary at the start and end of each matched string. This is equivalent to
|
||||
having "\eb(?:" at the start of each pattern, and ")\eb" at the end. This
|
||||
option applies only to the patterns that are matched against the contents of
|
||||
files; it does not apply to patterns specified by any of the \fB--include\fP or
|
||||
\fB--exclude\fP options.
|
||||
.TP
|
||||
\fB-x\fP, \fB--line-regex\fP, \fB--line-regexp\fP
|
||||
Force the patterns to start matching only at the beginnings of lines, and in
|
||||
addition, require them to match entire lines. In multiline mode the match may
|
||||
be more than one line. This is equivalent to having "^(?:" at the start of each
|
||||
pattern and ")$" at the end. This option applies only to the patterns that are
|
||||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the \fB--include\fP or \fB--exclude\fP options.
|
||||
.TP
|
||||
\fB-Z\fP, \fB--null\fP
|
||||
Terminate files names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
.
|
||||
.
|
||||
.SH "ENVIRONMENT VARIABLES"
|
||||
.rs
|
||||
.sp
|
||||
The environment variables \fBLC_ALL\fP and \fBLC_CTYPE\fP are examined, in that
|
||||
order, for a locale. The first one that is set is used. This can be overridden
|
||||
by the \fB--locale\fP option. If no locale is set, the PCRE2 library's default
|
||||
(usually the "C" locale) is used.
|
||||
.
|
||||
.
|
||||
.SH "NEWLINES"
|
||||
.rs
|
||||
.sp
|
||||
The \fB-N\fP (\fB--newline\fP) option allows \fBpcre2grep\fP to scan files with
|
||||
newline conventions that differ from the default. This option affects only the
|
||||
way scanned files are processed. It does not affect the interpretation of files
|
||||
specified by the \fB-f\fP, \fB--file-list\fP, \fB--exclude-from\fP, or
|
||||
\fB--include-from\fP options.
|
||||
.P
|
||||
Any parts of the scanned input files that are written to the standard output
|
||||
are copied with whatever newline sequences they have in the input. However, if
|
||||
the final line of a file is output, and it does not end with a newline
|
||||
sequence, a newline sequence is added. If the newline setting is CR, LF, CRLF
|
||||
or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a
|
||||
single NL is used.
|
||||
.P
|
||||
The newline setting does not affect the way in which \fBpcre2grep\fP writes
|
||||
newlines in informational messages to the standard output and error streams.
|
||||
Under Windows, the standard output is set to be binary, so that "\er\en" at the
|
||||
ends of output lines that are copied from the input is not converted to
|
||||
"\er\er\en" by the C I/O library. This means that any messages written to the
|
||||
standard output must end with "\er\en". For all other operating systems, and
|
||||
for all messages to the standard error stream, "\en" is used.
|
||||
.
|
||||
.
|
||||
.SH "OPTIONS COMPATIBILITY"
|
||||
.rs
|
||||
.sp
|
||||
Many of the short and long forms of \fBpcre2grep\fP's options are the same
|
||||
as in the GNU \fBgrep\fP program. Any long option of the form
|
||||
\fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP
|
||||
(PCRE2 terminology). However, the \fB--depth-limit\fP, \fB--file-list\fP,
|
||||
\fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP,
|
||||
\fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP,
|
||||
\fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP,
|
||||
\fB--output\fP, \fB-u\fP, \fB--utf\fP, \fB-U\fP, and \fB--utf-allow-invalid\fP
|
||||
options are specific to \fBpcre2grep\fP, as is the use of the
|
||||
\fB--only-matching\fP option with a capturing parentheses number.
|
||||
.P
|
||||
Although most of the common options work the same way, a few are different in
|
||||
\fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob
|
||||
for GNU \fBgrep\fP, but a regular expression for \fBpcre2grep\fP. If both the
|
||||
\fB-c\fP and \fB-l\fP options are given, GNU grep lists only file names,
|
||||
without counts, but \fBpcre2grep\fP gives the counts as well.
|
||||
.
|
||||
.
|
||||
.SH "OPTIONS WITH DATA"
|
||||
.rs
|
||||
.sp
|
||||
There are four different ways in which an option with data can be specified.
|
||||
If a short form option is used, the data may follow immediately, or (with one
|
||||
exception) in the next command line item. For example:
|
||||
.sp
|
||||
-f/some/file
|
||||
-f /some/file
|
||||
.sp
|
||||
The exception is the \fB-o\fP option, which may appear with or without data.
|
||||
Because of this, if data is present, it must follow immediately in the same
|
||||
item, for example -o3.
|
||||
.P
|
||||
If a long form option is used, the data may appear in the same command line
|
||||
item, separated by an equals character, or (with two exceptions) it may appear
|
||||
in the next command line item. For example:
|
||||
.sp
|
||||
--file=/some/file
|
||||
--file /some/file
|
||||
.sp
|
||||
Note, however, that if you want to supply a file name beginning with ~ as data
|
||||
in a shell command, and have the shell expand ~ to a home directory, you must
|
||||
separate the file name from the option, because the shell does not treat ~
|
||||
specially unless it is at the start of an item.
|
||||
.P
|
||||
The exceptions to the above are the \fB--colour\fP (or \fB--color\fP) and
|
||||
\fB--only-matching\fP options, for which the data is optional. If one of these
|
||||
options does have data, it must be given in the first form, using an equals
|
||||
character. Otherwise \fBpcre2grep\fP will assume that it has no data.
|
||||
.
|
||||
.
|
||||
.SH "USING PCRE2'S CALLOUT FACILITY"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcre2grep\fP has, by default, support for calling external programs or
|
||||
scripts or echoing specific strings during matching by making use of PCRE2's
|
||||
callout facility. However, this support can be completely or partially disabled
|
||||
when \fBpcre2grep\fP is built. You can find out whether your binary has support
|
||||
for callouts by running it with the \fB--help\fP option. If callout support is
|
||||
completely disabled, all callouts in patterns are ignored by \fBpcre2grep\fP.
|
||||
If the facility is partially disabled, calling external programs is not
|
||||
supported, and callouts that request it are ignored.
|
||||
.P
|
||||
A callout in a PCRE2 pattern is of the form (?C<arg>) where the argument is
|
||||
either a number or a quoted string (see the
|
||||
.\" HREF
|
||||
\fBpcre2callout\fP
|
||||
.\"
|
||||
documentation for details). Numbered callouts are ignored by \fBpcre2grep\fP;
|
||||
only callouts with string arguments are useful.
|
||||
.
|
||||
.
|
||||
.SS "Echoing a specific string"
|
||||
.rs
|
||||
.sp
|
||||
Starting the callout string with a pipe character invokes an echoing facility
|
||||
that avoids calling an external program or script. This facility is always
|
||||
available, provided that callouts were not completely disabled when
|
||||
\fBpcre2grep\fP was built. The rest of the callout string is processed as a
|
||||
zero-terminated string, which means it should not contain any internal binary
|
||||
zeros. It is written to the output, having first been passed through the same
|
||||
escape processing as text from the \fB--output\fP (\fB-O\fP) option (see
|
||||
above). However, $0 cannot be used to insert a matched substring because the
|
||||
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||
syntax errors in the string (for example, a dollar not followed by another
|
||||
character) causes the callout to be ignored. No terminator is added to the
|
||||
output string, so if you want a newline, you must include it explicitly using
|
||||
the escape $n. For example:
|
||||
.sp
|
||||
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||
.sp
|
||||
Matching continues normally after the string is output. If you want to see only
|
||||
the callout output but not any output from an actual match, you should end the
|
||||
pattern with (*FAIL).
|
||||
.
|
||||
.
|
||||
.SS "Calling external programs or scripts"
|
||||
.rs
|
||||
.sp
|
||||
This facility can be independently disabled when \fBpcre2grep\fP is built. It
|
||||
is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS,
|
||||
where \fBlib$spawn()\fP is used, and for any Unix-like environment where
|
||||
\fBfork()\fP and \fBexecv()\fP are available.
|
||||
.P
|
||||
If the callout string does not start with a pipe (vertical bar) character, it
|
||||
is parsed into a list of substrings separated by pipe characters. The first
|
||||
substring must be an executable name, with the following substrings specifying
|
||||
arguments:
|
||||
.sp
|
||||
executable_name|arg1|arg2|...
|
||||
.sp
|
||||
Any substring (including the executable name) may contain escape sequences
|
||||
started by a dollar character. These are the same as for the \fB--output\fP
|
||||
(\fB-O\fP) option documented above, except that $0 cannot insert the matched
|
||||
string because the match is still in progress. Instead, the character '0'
|
||||
is inserted. If you need a literal dollar or pipe character in any
|
||||
substring, use $$ or $| respectively. Here is an example:
|
||||
.sp
|
||||
echo -e "abcde\en12345" | pcre2grep \e
|
||||
'(?x)(.)(..(.))
|
||||
(?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' -
|
||||
.sp
|
||||
Output:
|
||||
.sp
|
||||
Arg1: [a] [bcd] [d] Arg2: |a| ()
|
||||
abcde
|
||||
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||
12345
|
||||
.sp
|
||||
The parameters for the system call that is used to run the program or script
|
||||
are zero-terminated strings. This means that binary zero characters in the
|
||||
callout argument will cause premature termination of their substrings, and
|
||||
therefore should not be present. Any syntax errors in the string (for example,
|
||||
a dollar not followed by another character) causes the callout to be ignored.
|
||||
If running the program fails for any reason (including the non-existence of the
|
||||
executable), a local matching failure occurs and the matcher backtracks in the
|
||||
normal way.
|
||||
.
|
||||
.
|
||||
.SH "MATCHING ERRORS"
|
||||
.rs
|
||||
.sp
|
||||
It is possible to supply a regular expression that takes a very long time to
|
||||
fail to match certain lines. Such patterns normally involve nested indefinite
|
||||
repeats, for example: (a+)*\ed when matched against a line of a's with no final
|
||||
digit. The PCRE2 matching function has a resource limit that causes it to abort
|
||||
in these circumstances. If this happens, \fBpcre2grep\fP outputs an error
|
||||
message and the line that caused the problem to the standard error stream. If
|
||||
there are more than 20 such errors, \fBpcre2grep\fP gives up.
|
||||
.P
|
||||
The \fB--match-limit\fP option of \fBpcre2grep\fP can be used to set the
|
||||
overall resource limit. There are also other limits that affect the amount of
|
||||
memory used during matching; see the discussion of \fB--heap-limit\fP and
|
||||
\fB--depth-limit\fP above.
|
||||
.
|
||||
.
|
||||
.SH DIAGNOSTICS
|
||||
.rs
|
||||
.sp
|
||||
Exit status is 0 if any matches were found, 1 if no matches were found, and 2
|
||||
for syntax errors, overlong lines, non-existent or inaccessible files (even if
|
||||
matches were found in other files) or too many matching errors. Using the
|
||||
\fB-s\fP option to suppress error messages about inaccessible files does not
|
||||
affect the return code.
|
||||
.P
|
||||
When run under VMS, the return code is placed in the symbol PCRE2GREP_RC
|
||||
because VMS does not distinguish between exit(0) and exit(1).
|
||||
.
|
||||
.
|
||||
.SH "SEE ALSO"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3),
|
||||
\fBpcre2unicode\fP(3).
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 21 November 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
4577
third_party/pcre/pcre2grep.c
vendored
Normal file
4577
third_party/pcre/pcre2grep.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
329
third_party/pcre/pcre2posix.3
vendored
Normal file
329
third_party/pcre/pcre2posix.3
vendored
Normal file
|
@ -0,0 +1,329 @@
|
|||
.TH PCRE2POSIX 3 "26 April 2021" "PCRE2 10.37"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "SYNOPSIS"
|
||||
.rs
|
||||
.sp
|
||||
.B #include <pcre2posix.h>
|
||||
.PP
|
||||
.nf
|
||||
.B int pcre2_regcomp(regex_t *\fIpreg\fP, const char *\fIpattern\fP,
|
||||
.B " int \fIcflags\fP);"
|
||||
.sp
|
||||
.B int pcre2_regexec(const regex_t *\fIpreg\fP, const char *\fIstring\fP,
|
||||
.B " size_t \fInmatch\fP, regmatch_t \fIpmatch\fP[], int \fIeflags\fP);"
|
||||
.sp
|
||||
.B "size_t pcre2_regerror(int \fIerrcode\fP, const regex_t *\fIpreg\fP,"
|
||||
.B " char *\fIerrbuf\fP, size_t \fIerrbuf_size\fP);"
|
||||
.sp
|
||||
.B void pcre2_regfree(regex_t *\fIpreg\fP);
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This set of functions provides a POSIX-style API for the PCRE2 regular
|
||||
expression 8-bit library. There are no POSIX-style wrappers for PCRE2's 16-bit
|
||||
and 32-bit libraries. See the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
documentation for a description of PCRE2's native API, which contains much
|
||||
additional functionality.
|
||||
.P
|
||||
The functions described here are wrapper functions that ultimately call the
|
||||
PCRE2 native API. Their prototypes are defined in the \fBpcre2posix.h\fP header
|
||||
file, and they all have unique names starting with \fBpcre2_\fP. However, the
|
||||
\fBpcre2posix.h\fP header also contains macro definitions that convert the
|
||||
standard POSIX names such \fBregcomp()\fP into \fBpcre2_regcomp()\fP etc. This
|
||||
means that a program can use the usual POSIX names without running the risk of
|
||||
accidentally linking with POSIX functions from a different library.
|
||||
.P
|
||||
On Unix-like systems the PCRE2 POSIX library is called \fBlibpcre2-posix\fP, so
|
||||
can be accessed by adding \fB-lpcre2-posix\fP to the command for linking an
|
||||
application. Because the POSIX functions call the native ones, it is also
|
||||
necessary to add \fB-lpcre2-8\fP.
|
||||
.P
|
||||
Although they were not defined as protypes in \fBpcre2posix.h\fP, releases
|
||||
10.33 to 10.36 of the library contained functions with the POSIX names
|
||||
\fBregcomp()\fP etc. These simply passed their arguments to the PCRE2
|
||||
functions. These functions were provided for backwards compatibility with
|
||||
earlier versions of PCRE2, which had only POSIX names. However, this has proved
|
||||
troublesome in situations where a program links with several libraries, some of
|
||||
which use PCRE2's POSIX interface while others use the real POSIX functions.
|
||||
For this reason, the POSIX names have been removed since release 10.37.
|
||||
.P
|
||||
Calling the header file \fBpcre2posix.h\fP avoids any conflict with other POSIX
|
||||
libraries. It can, of course, be renamed or aliased as \fBregex.h\fP, which is
|
||||
the "correct" name, if there is no clash. It provides two structure types,
|
||||
\fIregex_t\fP for compiled internal forms, and \fIregmatch_t\fP for returning
|
||||
captured substrings. It also defines some constants whose names start with
|
||||
"REG_"; these are used for setting options and identifying error codes.
|
||||
.
|
||||
.
|
||||
.SH "USING THE POSIX FUNCTIONS"
|
||||
.rs
|
||||
.sp
|
||||
Those POSIX option bits that can reasonably be mapped to PCRE2 native options
|
||||
have been implemented. In addition, the option REG_EXTENDED is defined with the
|
||||
value zero. This has no effect, but since programs that are written to the
|
||||
POSIX interface often use it, this makes it easier to slot in PCRE2 as a
|
||||
replacement library. Other POSIX options are not even defined.
|
||||
.P
|
||||
There are also some options that are not defined by POSIX. These have been
|
||||
added at the request of users who want to make use of certain PCRE2-specific
|
||||
features via the POSIX calling interface or to add BSD or GNU functionality.
|
||||
.P
|
||||
When PCRE2 is called via these functions, it is only the API that is POSIX-like
|
||||
in style. The syntax and semantics of the regular expressions themselves are
|
||||
still those of Perl, subject to the setting of various PCRE2 options, as
|
||||
described below. "POSIX-like in style" means that the API approximates to the
|
||||
POSIX definition; it is not fully POSIX-compatible, and in multi-unit encoding
|
||||
domains it is probably even less compatible.
|
||||
.P
|
||||
The descriptions below use the actual names of the functions, but, as described
|
||||
above, the standard POSIX names (without the \fBpcre2_\fP prefix) may also be
|
||||
used.
|
||||
.
|
||||
.
|
||||
.SH "COMPILING A PATTERN"
|
||||
.rs
|
||||
.sp
|
||||
The function \fBpcre2_regcomp()\fP is called to compile a pattern into an
|
||||
internal form. By default, the pattern is a C string terminated by a binary
|
||||
zero (but see REG_PEND below). The \fIpreg\fP argument is a pointer to a
|
||||
\fBregex_t\fP structure that is used as a base for storing information about
|
||||
the compiled regular expression. (It is also used for input when REG_PEND is
|
||||
set.)
|
||||
.P
|
||||
The argument \fIcflags\fP is either zero, or contains one or more of the bits
|
||||
defined by the following macros:
|
||||
.sp
|
||||
REG_DOTALL
|
||||
.sp
|
||||
The PCRE2_DOTALL option is set when the regular expression is passed for
|
||||
compilation to the native function. Note that REG_DOTALL is not part of the
|
||||
POSIX standard.
|
||||
.sp
|
||||
REG_ICASE
|
||||
.sp
|
||||
The PCRE2_CASELESS option is set when the regular expression is passed for
|
||||
compilation to the native function.
|
||||
.sp
|
||||
REG_NEWLINE
|
||||
.sp
|
||||
The PCRE2_MULTILINE option is set when the regular expression is passed for
|
||||
compilation to the native function. Note that this does \fInot\fP mimic the
|
||||
defined POSIX behaviour for REG_NEWLINE (see the following section).
|
||||
.sp
|
||||
REG_NOSPEC
|
||||
.sp
|
||||
The PCRE2_LITERAL option is set when the regular expression is passed for
|
||||
compilation to the native function. This disables all meta characters in the
|
||||
pattern, causing it to be treated as a literal string. The only other options
|
||||
that are allowed with REG_NOSPEC are REG_ICASE, REG_NOSUB, REG_PEND, and
|
||||
REG_UTF. Note that REG_NOSPEC is not part of the POSIX standard.
|
||||
.sp
|
||||
REG_NOSUB
|
||||
.sp
|
||||
When a pattern that is compiled with this flag is passed to
|
||||
\fBpcre2_regexec()\fP for matching, the \fInmatch\fP and \fIpmatch\fP arguments
|
||||
are ignored, and no captured strings are returned. Versions of the PCRE library
|
||||
prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this
|
||||
no longer happens because it disables the use of backreferences.
|
||||
.sp
|
||||
REG_PEND
|
||||
.sp
|
||||
If this option is set, the \fBreg_endp\fP field in the \fIpreg\fP structure
|
||||
(which has the type const char *) must be set to point to the character beyond
|
||||
the end of the pattern before calling \fBpcre2_regcomp()\fP. The pattern itself
|
||||
may now contain binary zeros, which are treated as data characters. Without
|
||||
REG_PEND, a binary zero terminates the pattern and the \fBre_endp\fP field is
|
||||
ignored. This is a GNU extension to the POSIX standard and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
.sp
|
||||
REG_UCP
|
||||
.sp
|
||||
The PCRE2_UCP option is set when the regular expression is passed for
|
||||
compilation to the native function. This causes PCRE2 to use Unicode properties
|
||||
when matchine \ed, \ew, etc., instead of just recognizing ASCII values. Note
|
||||
that REG_UCP is not part of the POSIX standard.
|
||||
.sp
|
||||
REG_UNGREEDY
|
||||
.sp
|
||||
The PCRE2_UNGREEDY option is set when the regular expression is passed for
|
||||
compilation to the native function. Note that REG_UNGREEDY is not part of the
|
||||
POSIX standard.
|
||||
.sp
|
||||
REG_UTF
|
||||
.sp
|
||||
The PCRE2_UTF option is set when the regular expression is passed for
|
||||
compilation to the native function. This causes the pattern itself and all data
|
||||
strings used for matching it to be treated as UTF-8 strings. Note that REG_UTF
|
||||
is not part of the POSIX standard.
|
||||
.P
|
||||
In the absence of these flags, no options are passed to the native function.
|
||||
This means the the regex is compiled with PCRE2 default semantics. In
|
||||
particular, the way it handles newline characters in the subject string is the
|
||||
Perl way, not the POSIX way. Note that setting PCRE2_MULTILINE has only
|
||||
\fIsome\fP of the effects specified for REG_NEWLINE. It does not affect the way
|
||||
newlines are matched by the dot metacharacter (they are not) or by a negative
|
||||
class such as [^a] (they are).
|
||||
.P
|
||||
The yield of \fBpcre2_regcomp()\fP is zero on success, and non-zero otherwise.
|
||||
The \fIpreg\fP structure is filled in on success, and one other member of the
|
||||
structure (as well as \fIre_endp\fP) is public: \fIre_nsub\fP contains the
|
||||
number of capturing subpatterns in the regular expression. Various error codes
|
||||
are defined in the header file.
|
||||
.P
|
||||
NOTE: If the yield of \fBpcre2_regcomp()\fP is non-zero, you must not attempt
|
||||
to use the contents of the \fIpreg\fP structure. If, for example, you pass it
|
||||
to \fBpcre2_regexec()\fP, the result is undefined and your program is likely to
|
||||
crash.
|
||||
.
|
||||
.
|
||||
.SH "MATCHING NEWLINE CHARACTERS"
|
||||
.rs
|
||||
.sp
|
||||
This area is not simple, because POSIX and Perl take different views of things.
|
||||
It is not possible to get PCRE2 to obey POSIX semantics, but then PCRE2 was
|
||||
never intended to be a POSIX engine. The following table lists the different
|
||||
possibilities for matching newline characters in Perl and PCRE2:
|
||||
.sp
|
||||
Default Change with
|
||||
.sp
|
||||
. matches newline no PCRE2_DOTALL
|
||||
newline matches [^a] yes not changeable
|
||||
$ matches \en at end yes PCRE2_DOLLAR_ENDONLY
|
||||
$ matches \en in middle no PCRE2_MULTILINE
|
||||
^ matches \en in middle no PCRE2_MULTILINE
|
||||
.sp
|
||||
This is the equivalent table for a POSIX-compatible pattern matcher:
|
||||
.sp
|
||||
Default Change with
|
||||
.sp
|
||||
. matches newline yes REG_NEWLINE
|
||||
newline matches [^a] yes REG_NEWLINE
|
||||
$ matches \en at end no REG_NEWLINE
|
||||
$ matches \en in middle no REG_NEWLINE
|
||||
^ matches \en in middle no REG_NEWLINE
|
||||
.sp
|
||||
This behaviour is not what happens when PCRE2 is called via its POSIX
|
||||
API. By default, PCRE2's behaviour is the same as Perl's, except that there is
|
||||
no equivalent for PCRE2_DOLLAR_ENDONLY in Perl. In both PCRE2 and Perl, there
|
||||
is no way to stop newline from matching [^a].
|
||||
.P
|
||||
Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and
|
||||
PCRE2_DOLLAR_ENDONLY when calling \fBpcre2_compile()\fP directly, but there is
|
||||
no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using
|
||||
the POSIX API, passing REG_NEWLINE to PCRE2's \fBpcre2_regcomp()\fP function
|
||||
causes PCRE2_MULTILINE to be passed to \fBpcre2_compile()\fP, and REG_DOTALL
|
||||
passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY.
|
||||
.
|
||||
.
|
||||
.SH "MATCHING A PATTERN"
|
||||
.rs
|
||||
.sp
|
||||
The function \fBpcre2_regexec()\fP is called to match a compiled pattern
|
||||
\fIpreg\fP against a given \fIstring\fP, which is by default terminated by a
|
||||
zero byte (but see REG_STARTEND below), subject to the options in \fIeflags\fP.
|
||||
These can be:
|
||||
.sp
|
||||
REG_NOTBOL
|
||||
.sp
|
||||
The PCRE2_NOTBOL option is set when calling the underlying PCRE2 matching
|
||||
function.
|
||||
.sp
|
||||
REG_NOTEMPTY
|
||||
.sp
|
||||
The PCRE2_NOTEMPTY option is set when calling the underlying PCRE2 matching
|
||||
function. Note that REG_NOTEMPTY is not part of the POSIX standard. However,
|
||||
setting this option can give more POSIX-like behaviour in some situations.
|
||||
.sp
|
||||
REG_NOTEOL
|
||||
.sp
|
||||
The PCRE2_NOTEOL option is set when calling the underlying PCRE2 matching
|
||||
function.
|
||||
.sp
|
||||
REG_STARTEND
|
||||
.sp
|
||||
When this option is set, the subject string starts at \fIstring\fP +
|
||||
\fIpmatch[0].rm_so\fP and ends at \fIstring\fP + \fIpmatch[0].rm_eo\fP, which
|
||||
should point to the first character beyond the string. There may be binary
|
||||
zeros within the subject string, and indeed, using REG_STARTEND is the only
|
||||
way to pass a subject string that contains a binary zero.
|
||||
.P
|
||||
Whatever the value of \fIpmatch[0].rm_so\fP, the offsets of the matched string
|
||||
and any captured substrings are still given relative to the start of
|
||||
\fIstring\fP itself. (Before PCRE2 release 10.30 these were given relative to
|
||||
\fIstring\fP + \fIpmatch[0].rm_so\fP, but this differs from other
|
||||
implementations.)
|
||||
.P
|
||||
This is a BSD extension, compatible with but not specified by IEEE Standard
|
||||
1003.2 (POSIX.2), and should be used with caution in software intended to be
|
||||
portable to other systems. Note that a non-zero \fIrm_so\fP does not imply
|
||||
REG_NOTBOL; REG_STARTEND affects only the location and length of the string,
|
||||
not how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL
|
||||
are mutually exclusive; the error REG_INVARG is returned.
|
||||
.P
|
||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||
strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of
|
||||
\fBpcre2_regexec()\fP are ignored (except possibly as input for REG_STARTEND).
|
||||
.P
|
||||
The value of \fInmatch\fP may be zero, and the value \fIpmatch\fP may be NULL
|
||||
(unless REG_STARTEND is set); in both these cases no data about any matched
|
||||
strings is returned.
|
||||
.P
|
||||
Otherwise, the portion of the string that was matched, and also any captured
|
||||
substrings, are returned via the \fIpmatch\fP argument, which points to an
|
||||
array of \fInmatch\fP structures of type \fIregmatch_t\fP, containing the
|
||||
members \fIrm_so\fP and \fIrm_eo\fP. These contain the byte offset to the first
|
||||
character of each substring and the offset to the first character after the end
|
||||
of each substring, respectively. The 0th element of the vector relates to the
|
||||
entire portion of \fIstring\fP that was matched; subsequent elements relate to
|
||||
the capturing subpatterns of the regular expression. Unused entries in the
|
||||
array have both structure members set to -1.
|
||||
.P
|
||||
A successful match yields a zero return; various error codes are defined in the
|
||||
header file, of which REG_NOMATCH is the "expected" failure code.
|
||||
.
|
||||
.
|
||||
.SH "ERROR MESSAGES"
|
||||
.rs
|
||||
.sp
|
||||
The \fBpcre2_regerror()\fP function maps a non-zero errorcode from either
|
||||
\fBpcre2_regcomp()\fP or \fBpcre2_regexec()\fP to a printable message. If
|
||||
\fIpreg\fP is not NULL, the error should have arisen from the use of that
|
||||
structure. A message terminated by a binary zero is placed in \fIerrbuf\fP. If
|
||||
the buffer is too short, only the first \fIerrbuf_size\fP - 1 characters of the
|
||||
error message are used. The yield of the function is the size of buffer needed
|
||||
to hold the whole message, including the terminating zero. This value is
|
||||
greater than \fIerrbuf_size\fP if the message was truncated.
|
||||
.
|
||||
.
|
||||
.SH MEMORY USAGE
|
||||
.rs
|
||||
.sp
|
||||
Compiling a regular expression causes memory to be allocated and associated
|
||||
with the \fIpreg\fP structure. The function \fBpcre2_regfree()\fP frees all
|
||||
such memory, after which \fIpreg\fP may no longer be used as a compiled
|
||||
expression.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 26 April 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
435
third_party/pcre/pcre2posix.c
vendored
Normal file
435
third_party/pcre/pcre2posix.c
vendored
Normal file
|
@ -0,0 +1,435 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module is a wrapper that provides a POSIX API to the underlying PCRE2
|
||||
functions. The operative functions are called pcre2_regcomp(), etc., with
|
||||
wrappers that use the plain POSIX names. In addition, pcre2posix.h defines the
|
||||
POSIX names as macros for the pcre2_xxx functions, so any program that includes
|
||||
it and uses the POSIX names will call the base functions directly. This makes
|
||||
it easier for an application to be sure it gets the PCRE2 versions in the
|
||||
presence of other POSIX regex libraries. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
|
||||
/* Ensure that the PCRE2POSIX_EXP_xxx macros are set appropriately for
|
||||
compiling these functions. This must come before including pcre2posix.h, where
|
||||
they are set for an application (using these functions) if they have not
|
||||
previously been set. */
|
||||
|
||||
#if defined(_WIN32) && !defined(PCRE2_STATIC)
|
||||
# define PCRE2POSIX_EXP_DECL extern __declspec(dllexport)
|
||||
# define PCRE2POSIX_EXP_DEFN __declspec(dllexport)
|
||||
#endif
|
||||
|
||||
/* Older versions of MSVC lack snprintf(). This define allows for
|
||||
warning/error-free compilation and testing with MSVC compilers back to at least
|
||||
MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */
|
||||
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1900)
|
||||
#define snprintf _snprintf
|
||||
#endif
|
||||
|
||||
|
||||
/* Compile-time error numbers start at this value. It should probably never be
|
||||
changed. This #define is a copy of the one in pcre2_internal.h. */
|
||||
|
||||
#define COMPILE_ERROR_BASE 100
|
||||
|
||||
|
||||
/* Standard C headers */
|
||||
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* PCRE2 headers */
|
||||
|
||||
#include "pcre2.h"
|
||||
#include "pcre2posix.h"
|
||||
|
||||
/* Table to translate PCRE2 compile time error codes into POSIX error codes.
|
||||
Only a few PCRE2 errors with a value greater than 23 turn into special POSIX
|
||||
codes: most go to REG_BADPAT. The second table lists, in pairs, those that
|
||||
don't. */
|
||||
|
||||
static const int eint1[] = {
|
||||
0, /* No error */
|
||||
REG_EESCAPE, /* \ at end of pattern */
|
||||
REG_EESCAPE, /* \c at end of pattern */
|
||||
REG_EESCAPE, /* unrecognized character follows \ */
|
||||
REG_BADBR, /* numbers out of order in {} quantifier */
|
||||
/* 5 */
|
||||
REG_BADBR, /* number too big in {} quantifier */
|
||||
REG_EBRACK, /* missing terminating ] for character class */
|
||||
REG_ECTYPE, /* invalid escape sequence in character class */
|
||||
REG_ERANGE, /* range out of order in character class */
|
||||
REG_BADRPT, /* nothing to repeat */
|
||||
/* 10 */
|
||||
REG_ASSERT, /* internal error: unexpected repeat */
|
||||
REG_BADPAT, /* unrecognized character after (? or (?- */
|
||||
REG_BADPAT, /* POSIX named classes are supported only within a class */
|
||||
REG_BADPAT, /* POSIX collating elements are not supported */
|
||||
REG_EPAREN, /* missing ) */
|
||||
/* 15 */
|
||||
REG_ESUBREG, /* reference to non-existent subpattern */
|
||||
REG_INVARG, /* pattern passed as NULL */
|
||||
REG_INVARG, /* unknown compile-time option bit(s) */
|
||||
REG_EPAREN, /* missing ) after (?# comment */
|
||||
REG_ESIZE, /* parentheses nested too deeply */
|
||||
/* 20 */
|
||||
REG_ESIZE, /* regular expression too large */
|
||||
REG_ESPACE, /* failed to get memory */
|
||||
REG_EPAREN, /* unmatched closing parenthesis */
|
||||
REG_ASSERT /* internal error: code overflow */
|
||||
};
|
||||
|
||||
static const int eint2[] = {
|
||||
30, REG_ECTYPE, /* unknown POSIX class name */
|
||||
32, REG_INVARG, /* this version of PCRE2 does not have Unicode support */
|
||||
37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */
|
||||
56, REG_INVARG, /* internal error: unknown newline setting */
|
||||
92, REG_INVARG, /* invalid option bits with PCRE2_LITERAL */
|
||||
99, REG_EESCAPE /* \K in lookaround */
|
||||
};
|
||||
|
||||
/* Table of texts corresponding to POSIX error codes */
|
||||
|
||||
static const char *const pstring[] = {
|
||||
"", /* Dummy for value 0 */
|
||||
"internal error", /* REG_ASSERT */
|
||||
"invalid repeat counts in {}", /* BADBR */
|
||||
"pattern error", /* BADPAT */
|
||||
"? * + invalid", /* BADRPT */
|
||||
"unbalanced {}", /* EBRACE */
|
||||
"unbalanced []", /* EBRACK */
|
||||
"collation error - not relevant", /* ECOLLATE */
|
||||
"bad class", /* ECTYPE */
|
||||
"bad escape sequence", /* EESCAPE */
|
||||
"empty expression", /* EMPTY */
|
||||
"unbalanced ()", /* EPAREN */
|
||||
"bad range inside []", /* ERANGE */
|
||||
"expression too big", /* ESIZE */
|
||||
"failed to get memory", /* ESPACE */
|
||||
"bad back reference", /* ESUBREG */
|
||||
"bad argument", /* INVARG */
|
||||
"match failed" /* NOMATCH */
|
||||
};
|
||||
|
||||
|
||||
|
||||
#if 0 /* REMOVE THIS CODE */
|
||||
|
||||
The code below was created for 10.33 (see ChangeLog 10.33 #4) when the
|
||||
POSIX functions were given pcre2_... names instead of the traditional POSIX
|
||||
names. However, it has proved to be more troublesome than useful. There have
|
||||
been at least two cases where a program links with two others, one of which
|
||||
uses the POSIX library and the other uses the PCRE2 POSIX functions, thus
|
||||
causing two instances of the POSIX runctions to exist, leading to trouble. For
|
||||
10.37 this code is commented out. In due course it can be removed if there are
|
||||
no issues. The only small worry is the comment below about languages that do
|
||||
not include pcre2posix.h. If there are any such cases, they will have to use
|
||||
the PCRE2 names.
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Wrappers with traditional POSIX names *
|
||||
*************************************************/
|
||||
|
||||
/* Keep defining them to preseve the ABI for applications linked to the pcre2
|
||||
POSIX library before these names were changed into macros in pcre2posix.h.
|
||||
This also ensures that the POSIX names are callable from languages that do not
|
||||
include pcre2posix.h. It is vital to #undef the macro definitions from
|
||||
pcre2posix.h! */
|
||||
|
||||
#undef regerror
|
||||
PCRE2POSIX_EXP_DECL size_t regerror(int, const regex_t *, char *, size_t);
|
||||
PCRE2POSIX_EXP_DEFN size_t PCRE2_CALL_CONVENTION
|
||||
regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
|
||||
{
|
||||
return pcre2_regerror(errcode, preg, errbuf, errbuf_size);
|
||||
}
|
||||
|
||||
#undef regfree
|
||||
PCRE2POSIX_EXP_DECL void regfree(regex_t *);
|
||||
PCRE2POSIX_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
regfree(regex_t *preg)
|
||||
{
|
||||
pcre2_regfree(preg);
|
||||
}
|
||||
|
||||
#undef regcomp
|
||||
PCRE2POSIX_EXP_DECL int regcomp(regex_t *, const char *, int);
|
||||
PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
regcomp(regex_t *preg, const char *pattern, int cflags)
|
||||
{
|
||||
return pcre2_regcomp(preg, pattern, cflags);
|
||||
}
|
||||
|
||||
#undef regexec
|
||||
PCRE2POSIX_EXP_DECL int regexec(const regex_t *, const char *, size_t,
|
||||
regmatch_t *, int);
|
||||
PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
regexec(const regex_t *preg, const char *string, size_t nmatch,
|
||||
regmatch_t pmatch[], int eflags)
|
||||
{
|
||||
return pcre2_regexec(preg, string, nmatch, pmatch, eflags);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Translate error code to string *
|
||||
*************************************************/
|
||||
|
||||
PCRE2POSIX_EXP_DEFN size_t PCRE2_CALL_CONVENTION
|
||||
pcre2_regerror(int errcode, const regex_t *preg, char *errbuf,
|
||||
size_t errbuf_size)
|
||||
{
|
||||
int used;
|
||||
const char *message;
|
||||
|
||||
message = (errcode <= 0 || errcode >= (int)(sizeof(pstring)/sizeof(char *)))?
|
||||
"unknown error code" : pstring[errcode];
|
||||
|
||||
if (preg != NULL && (int)preg->re_erroffset != -1)
|
||||
{
|
||||
used = snprintf(errbuf, errbuf_size, "%s at offset %-6d", message,
|
||||
(int)preg->re_erroffset);
|
||||
}
|
||||
else
|
||||
{
|
||||
used = snprintf(errbuf, errbuf_size, "%s", message);
|
||||
}
|
||||
|
||||
return used + 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Free store held by a regex *
|
||||
*************************************************/
|
||||
|
||||
PCRE2POSIX_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_regfree(regex_t *preg)
|
||||
{
|
||||
pcre2_match_data_free(preg->re_match_data);
|
||||
pcre2_code_free(preg->re_pcre2_code);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Compile a regular expression *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
preg points to a structure for recording the compiled expression
|
||||
pattern the pattern to compile
|
||||
cflags compilation flags
|
||||
|
||||
Returns: 0 on success
|
||||
various non-zero codes on failure
|
||||
*/
|
||||
|
||||
PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_regcomp(regex_t *preg, const char *pattern, int cflags)
|
||||
{
|
||||
PCRE2_SIZE erroffset;
|
||||
PCRE2_SIZE patlen;
|
||||
int errorcode;
|
||||
int options = 0;
|
||||
int re_nsub = 0;
|
||||
|
||||
patlen = ((cflags & REG_PEND) != 0)? (PCRE2_SIZE)(preg->re_endp - pattern) :
|
||||
PCRE2_ZERO_TERMINATED;
|
||||
|
||||
if ((cflags & REG_ICASE) != 0) options |= PCRE2_CASELESS;
|
||||
if ((cflags & REG_NEWLINE) != 0) options |= PCRE2_MULTILINE;
|
||||
if ((cflags & REG_DOTALL) != 0) options |= PCRE2_DOTALL;
|
||||
if ((cflags & REG_NOSPEC) != 0) options |= PCRE2_LITERAL;
|
||||
if ((cflags & REG_UTF) != 0) options |= PCRE2_UTF;
|
||||
if ((cflags & REG_UCP) != 0) options |= PCRE2_UCP;
|
||||
if ((cflags & REG_UNGREEDY) != 0) options |= PCRE2_UNGREEDY;
|
||||
|
||||
preg->re_cflags = cflags;
|
||||
preg->re_pcre2_code = pcre2_compile((PCRE2_SPTR)pattern, patlen, options,
|
||||
&errorcode, &erroffset, NULL);
|
||||
preg->re_erroffset = erroffset;
|
||||
|
||||
if (preg->re_pcre2_code == NULL)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
/* A negative value is a UTF error; otherwise all error codes are greater
|
||||
than COMPILE_ERROR_BASE, but check, just in case. */
|
||||
|
||||
if (errorcode < COMPILE_ERROR_BASE) return REG_BADPAT;
|
||||
errorcode -= COMPILE_ERROR_BASE;
|
||||
|
||||
if (errorcode < (int)(sizeof(eint1)/sizeof(const int)))
|
||||
return eint1[errorcode];
|
||||
for (i = 0; i < sizeof(eint2)/sizeof(const int); i += 2)
|
||||
if (errorcode == eint2[i]) return eint2[i+1];
|
||||
return REG_BADPAT;
|
||||
}
|
||||
|
||||
(void)pcre2_pattern_info((const pcre2_code *)preg->re_pcre2_code,
|
||||
PCRE2_INFO_CAPTURECOUNT, &re_nsub);
|
||||
preg->re_nsub = (size_t)re_nsub;
|
||||
preg->re_match_data = pcre2_match_data_create(re_nsub + 1, NULL);
|
||||
preg->re_erroffset = (size_t)(-1); /* No meaning after successful compile */
|
||||
|
||||
if (preg->re_match_data == NULL)
|
||||
{
|
||||
/* LCOV_EXCL_START */
|
||||
pcre2_code_free(preg->re_pcre2_code);
|
||||
return REG_ESPACE;
|
||||
/* LCOV_EXCL_STOP */
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Match a regular expression *
|
||||
*************************************************/
|
||||
|
||||
/* A suitable match_data block, large enough to hold all possible captures, was
|
||||
obtained when the pattern was compiled, to save having to allocate and free it
|
||||
for each match. If REG_NOSUB was specified at compile time, the nmatch and
|
||||
pmatch arguments are ignored, and the only result is yes/no/error. */
|
||||
|
||||
PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_regexec(const regex_t *preg, const char *string, size_t nmatch,
|
||||
regmatch_t pmatch[], int eflags)
|
||||
{
|
||||
int rc, so, eo;
|
||||
int options = 0;
|
||||
pcre2_match_data *md = (pcre2_match_data *)preg->re_match_data;
|
||||
|
||||
if (string == NULL) return REG_INVARG;
|
||||
|
||||
if ((eflags & REG_NOTBOL) != 0) options |= PCRE2_NOTBOL;
|
||||
if ((eflags & REG_NOTEOL) != 0) options |= PCRE2_NOTEOL;
|
||||
if ((eflags & REG_NOTEMPTY) != 0) options |= PCRE2_NOTEMPTY;
|
||||
|
||||
/* When REG_NOSUB was specified, or if no vector has been passed in which to
|
||||
put captured strings, ensure that nmatch is zero. This will stop any attempt to
|
||||
write to pmatch. */
|
||||
|
||||
if ((preg->re_cflags & REG_NOSUB) != 0 || pmatch == NULL) nmatch = 0;
|
||||
|
||||
/* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings.
|
||||
The man page from OS X says "REG_STARTEND affects only the location of the
|
||||
string, not how it is matched". That is why the "so" value is used to bump the
|
||||
start location rather than being passed as a PCRE2 "starting offset". */
|
||||
|
||||
if ((eflags & REG_STARTEND) != 0)
|
||||
{
|
||||
if (pmatch == NULL) return REG_INVARG;
|
||||
so = pmatch[0].rm_so;
|
||||
eo = pmatch[0].rm_eo;
|
||||
}
|
||||
else
|
||||
{
|
||||
so = 0;
|
||||
eo = (int)strlen(string);
|
||||
}
|
||||
|
||||
rc = pcre2_match((const pcre2_code *)preg->re_pcre2_code,
|
||||
(PCRE2_SPTR)string + so, (eo - so), 0, options, md, NULL);
|
||||
|
||||
/* Successful match */
|
||||
|
||||
if (rc >= 0)
|
||||
{
|
||||
size_t i;
|
||||
PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md);
|
||||
if ((size_t)rc > nmatch) rc = (int)nmatch;
|
||||
for (i = 0; i < (size_t)rc; i++)
|
||||
{
|
||||
pmatch[i].rm_so = (ovector[i*2] == PCRE2_UNSET)? -1 :
|
||||
(int)(ovector[i*2] + so);
|
||||
pmatch[i].rm_eo = (ovector[i*2+1] == PCRE2_UNSET)? -1 :
|
||||
(int)(ovector[i*2+1] + so);
|
||||
}
|
||||
for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Unsuccessful match */
|
||||
|
||||
if (rc <= PCRE2_ERROR_UTF8_ERR1 && rc >= PCRE2_ERROR_UTF8_ERR21)
|
||||
return REG_INVARG;
|
||||
|
||||
/* Most of these are events that won't occur during testing, so exclude them
|
||||
from coverage. */
|
||||
|
||||
switch(rc)
|
||||
{
|
||||
case PCRE2_ERROR_HEAPLIMIT: return REG_ESPACE;
|
||||
case PCRE2_ERROR_NOMATCH: return REG_NOMATCH;
|
||||
|
||||
/* LCOV_EXCL_START */
|
||||
case PCRE2_ERROR_BADMODE: return REG_INVARG;
|
||||
case PCRE2_ERROR_BADMAGIC: return REG_INVARG;
|
||||
case PCRE2_ERROR_BADOPTION: return REG_INVARG;
|
||||
case PCRE2_ERROR_BADUTFOFFSET: return REG_INVARG;
|
||||
case PCRE2_ERROR_MATCHLIMIT: return REG_ESPACE;
|
||||
case PCRE2_ERROR_NOMEMORY: return REG_ESPACE;
|
||||
case PCRE2_ERROR_NULL: return REG_INVARG;
|
||||
default: return REG_ASSERT;
|
||||
/* LCOV_EXCL_STOP */
|
||||
}
|
||||
}
|
||||
|
||||
/* End of pcre2posix.c */
|
184
third_party/pcre/pcre2posix.h
vendored
Normal file
184
third_party/pcre/pcre2posix.h
vendored
Normal file
|
@ -0,0 +1,184 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE2 is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language. This is
|
||||
the public header file to be #included by applications that call PCRE2 via the
|
||||
POSIX wrapper interface.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* Have to include stdlib.h in order to ensure that size_t is defined. */
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
/* Allow for C++ users */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Options, mostly defined by POSIX, but with some extras. */
|
||||
|
||||
#define REG_ICASE 0x0001 /* Maps to PCRE2_CASELESS */
|
||||
#define REG_NEWLINE 0x0002 /* Maps to PCRE2_MULTILINE */
|
||||
#define REG_NOTBOL 0x0004 /* Maps to PCRE2_NOTBOL */
|
||||
#define REG_NOTEOL 0x0008 /* Maps to PCRE2_NOTEOL */
|
||||
#define REG_DOTALL 0x0010 /* NOT defined by POSIX; maps to PCRE2_DOTALL */
|
||||
#define REG_NOSUB 0x0020 /* Do not report what was matched */
|
||||
#define REG_UTF 0x0040 /* NOT defined by POSIX; maps to PCRE2_UTF */
|
||||
#define REG_STARTEND 0x0080 /* BSD feature: pass subject string by so,eo */
|
||||
#define REG_NOTEMPTY 0x0100 /* NOT defined by POSIX; maps to PCRE2_NOTEMPTY */
|
||||
#define REG_UNGREEDY 0x0200 /* NOT defined by POSIX; maps to PCRE2_UNGREEDY */
|
||||
#define REG_UCP 0x0400 /* NOT defined by POSIX; maps to PCRE2_UCP */
|
||||
#define REG_PEND 0x0800 /* GNU feature: pass end pattern by re_endp */
|
||||
#define REG_NOSPEC 0x1000 /* Maps to PCRE2_LITERAL */
|
||||
|
||||
/* This is not used by PCRE2, but by defining it we make it easier
|
||||
to slot PCRE2 into existing programs that make POSIX calls. */
|
||||
|
||||
#define REG_EXTENDED 0
|
||||
|
||||
/* Error values. Not all these are relevant or used by the wrapper. */
|
||||
|
||||
enum {
|
||||
REG_ASSERT = 1, /* internal error ? */
|
||||
REG_BADBR, /* invalid repeat counts in {} */
|
||||
REG_BADPAT, /* pattern error */
|
||||
REG_BADRPT, /* ? * + invalid */
|
||||
REG_EBRACE, /* unbalanced {} */
|
||||
REG_EBRACK, /* unbalanced [] */
|
||||
REG_ECOLLATE, /* collation error - not relevant */
|
||||
REG_ECTYPE, /* bad class */
|
||||
REG_EESCAPE, /* bad escape sequence */
|
||||
REG_EMPTY, /* empty expression */
|
||||
REG_EPAREN, /* unbalanced () */
|
||||
REG_ERANGE, /* bad range inside [] */
|
||||
REG_ESIZE, /* expression too big */
|
||||
REG_ESPACE, /* failed to get memory */
|
||||
REG_ESUBREG, /* bad back reference */
|
||||
REG_INVARG, /* bad argument */
|
||||
REG_NOMATCH /* match failed */
|
||||
};
|
||||
|
||||
|
||||
/* The structure representing a compiled regular expression. It is also used
|
||||
for passing the pattern end pointer when REG_PEND is set. */
|
||||
|
||||
typedef struct {
|
||||
void *re_pcre2_code;
|
||||
void *re_match_data;
|
||||
const char *re_endp;
|
||||
size_t re_nsub;
|
||||
size_t re_erroffset;
|
||||
int re_cflags;
|
||||
} regex_t;
|
||||
|
||||
/* The structure in which a captured offset is returned. */
|
||||
|
||||
typedef int regoff_t;
|
||||
|
||||
typedef struct {
|
||||
regoff_t rm_so;
|
||||
regoff_t rm_eo;
|
||||
} regmatch_t;
|
||||
|
||||
/* When compiling with the MSVC compiler, it is sometimes necessary to include
|
||||
a "calling convention" before exported function names. (This is secondhand
|
||||
information; I know nothing about MSVC myself). For example, something like
|
||||
|
||||
void __cdecl function(....)
|
||||
|
||||
might be needed. In order to make this easy, all the exported functions have
|
||||
PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not
|
||||
set, we ensure here that it has no effect. */
|
||||
|
||||
#ifndef PCRE2_CALL_CONVENTION
|
||||
#define PCRE2_CALL_CONVENTION
|
||||
#endif
|
||||
|
||||
/* When an application links to a PCRE2 DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE2, the appropriate
|
||||
export settings are needed, and are set in pcre2posix.c before including this
|
||||
file. */
|
||||
|
||||
#if defined(_WIN32) && !defined(PCRE2_STATIC) && !defined(PCRE2POSIX_EXP_DECL)
|
||||
# define PCRE2POSIX_EXP_DECL extern __declspec(dllimport)
|
||||
# define PCRE2POSIX_EXP_DEFN __declspec(dllimport)
|
||||
#endif
|
||||
|
||||
/* By default, we use the standard "extern" declarations. */
|
||||
|
||||
#ifndef PCRE2POSIX_EXP_DECL
|
||||
# ifdef __cplusplus
|
||||
# define PCRE2POSIX_EXP_DECL extern "C"
|
||||
# define PCRE2POSIX_EXP_DEFN extern "C"
|
||||
# else
|
||||
# define PCRE2POSIX_EXP_DECL extern
|
||||
# define PCRE2POSIX_EXP_DEFN extern
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* The functions. The actual code is in functions with pcre2_xxx names for
|
||||
uniqueness. POSIX names are provided as macros for API compatibility with POSIX
|
||||
regex functions. It's done this way to ensure to they are always linked from
|
||||
the PCRE2 library and not by accident from elsewhere (regex_t differs in size
|
||||
elsewhere). */
|
||||
|
||||
PCRE2POSIX_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_regcomp(regex_t *, const char *, int);
|
||||
PCRE2POSIX_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_regexec(const regex_t *, const char *, size_t,
|
||||
regmatch_t *, int);
|
||||
PCRE2POSIX_EXP_DECL size_t PCRE2_CALL_CONVENTION pcre2_regerror(int, const regex_t *, char *, size_t);
|
||||
PCRE2POSIX_EXP_DECL void PCRE2_CALL_CONVENTION pcre2_regfree(regex_t *);
|
||||
|
||||
#define regcomp pcre2_regcomp
|
||||
#define regexec pcre2_regexec
|
||||
#define regerror pcre2_regerror
|
||||
#define regfree pcre2_regfree
|
||||
|
||||
/* Debian had a patch that used different names. These are now here to save
|
||||
them having to maintain their own patch, but are not documented by PCRE2. */
|
||||
|
||||
#define PCRE2regcomp pcre2_regcomp
|
||||
#define PCRE2regexec pcre2_regexec
|
||||
#define PCRE2regerror pcre2_regerror
|
||||
#define PCRE2regfree pcre2_regfree
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
/* End of pcre2posix.h */
|
209
third_party/pcre/pcre2posix_test.c
vendored
Normal file
209
third_party/pcre/pcre2posix_test.c
vendored
Normal file
|
@ -0,0 +1,209 @@
|
|||
/*************************************************
|
||||
* PCRE2 POSIX interface test program *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Written by Philip Hazel, December 2022
|
||||
Copyright (c) 2022
|
||||
File last edited: December 2022
|
||||
|
||||
This program tests the POSIX wrapper to the PCRE2 regular expression library.
|
||||
The main PCRE2 test program is pcre2test, which also tests these function
|
||||
calls. This little program is needed to test the case where the client includes
|
||||
pcre2posix.h but not pcre2.h, mainly to make sure that it builds successfully.
|
||||
However, the code is written as a flexible test program to which extra tests
|
||||
can be added.
|
||||
|
||||
Compile with -lpcre2-posix -lpcre2-8
|
||||
|
||||
If run with no options, there is no output on success, and the return code is
|
||||
zero. If any test fails there is output to stderr, and the return code is 1.
|
||||
|
||||
For testing purposes, the "-v" option causes verification output to be written
|
||||
to stdout. */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "pcre2posix.h"
|
||||
|
||||
#define CAPCOUNT 5 /* Number of captures supported */
|
||||
#define PRINTF if (v) printf /* Shorthand for testing output */
|
||||
|
||||
/* This vector contains compiler flags for each pattern that is tested. */
|
||||
|
||||
static int cflags[] = {
|
||||
0, /* Test 0 */
|
||||
REG_ICASE, /* Test 1 */
|
||||
0, /* Test 2 */
|
||||
REG_NEWLINE, /* Test 3 */
|
||||
0 /* Test 4 */
|
||||
};
|
||||
|
||||
/* This vector contains match flags for each pattern that is tested. */
|
||||
|
||||
static int mflags[] = {
|
||||
0, /* Test 0 */
|
||||
0, /* Test 1 */
|
||||
0, /* Test 2 */
|
||||
REG_NOTBOL, /* Test 3 */
|
||||
0 /* Test 4 */
|
||||
};
|
||||
|
||||
/* Automate the number of patterns */
|
||||
|
||||
#define count (int)(sizeof(cflags)/sizeof(int))
|
||||
|
||||
/* The data for each pattern consists of a pattern string, followed by any
|
||||
number of subject strings, terminated by NULL. Some tests share data, but use
|
||||
different flags. */
|
||||
|
||||
static const char *data0_1[] = { "posix", "lower posix", "upper POSIX", NULL };
|
||||
static const char *data2_3[] = { "^(cat|dog)", "catastrophic\ncataclysm",
|
||||
"dogfight", "no animals", NULL };
|
||||
static const char *data4[] = { "*badpattern", NULL };
|
||||
|
||||
/* Index the data strings */
|
||||
|
||||
static char **data[] = {
|
||||
(char **)(&data0_1),
|
||||
(char **)(&data0_1),
|
||||
(char **)(&data2_3),
|
||||
(char **)(&data2_3),
|
||||
(char **)(&data4)
|
||||
};
|
||||
|
||||
/* The expected results for each pattern consist of a compiler return code,
|
||||
optionally followed, for each subject string, by a match return code and, for a
|
||||
successful match, up to CAPCOUNT pairs of returned match data. */
|
||||
|
||||
static int results0[] = {
|
||||
0, /* Compiler rc */
|
||||
0, 6, 11, /* 1st match */
|
||||
REG_NOMATCH /* 2nd match */
|
||||
};
|
||||
|
||||
static int results1[] = {
|
||||
0, /* Compiler rc */
|
||||
0, 6, 11, /* 1st match */
|
||||
0, 6, 11 /* 2nd match */
|
||||
};
|
||||
|
||||
static int results2[] = {
|
||||
0, /* Compiler rc */
|
||||
0, 0, 3, 0, 3, /* 1st match */
|
||||
0, 0, 3, 0, 3, /* 2nd match */
|
||||
REG_NOMATCH /* 3rd match */
|
||||
};
|
||||
|
||||
static int results3[] = {
|
||||
0, /* Compiler rc */
|
||||
0, 13, 16, 13, 16, /* 1st match */
|
||||
REG_NOMATCH, /* 2nd match */
|
||||
REG_NOMATCH /* 3rd match */
|
||||
};
|
||||
|
||||
static int results4[] = {
|
||||
REG_BADRPT /* Compiler rc */
|
||||
};
|
||||
|
||||
/* Index the result vectors */
|
||||
|
||||
static int *results[] = {
|
||||
(int *)(&results0),
|
||||
(int *)(&results1),
|
||||
(int *)(&results2),
|
||||
(int *)(&results3),
|
||||
(int *)(&results4)
|
||||
};
|
||||
|
||||
/* And here is the program */
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
regex_t re;
|
||||
regmatch_t match[CAPCOUNT];
|
||||
int v = argc > 1 && strcmp(argv[1], "-v") == 0;
|
||||
|
||||
PRINTF("Test of pcre2posix.h without pcre2.h\n");
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
char *pattern = data[i][0];
|
||||
char **subjects = data[i] + 1;
|
||||
int *rd = results[i];
|
||||
int rc = regcomp(&re, pattern, cflags[i]);
|
||||
|
||||
PRINTF("Pattern: %s flags=0x%02x\n", pattern, cflags[i]);
|
||||
|
||||
if (rc != *rd)
|
||||
{
|
||||
fprintf(stderr, "Unexpected compile error %d (expected %d)\n", rc, *rd);
|
||||
fprintf(stderr, "Pattern is: %s\n", pattern);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (rc != 0)
|
||||
{
|
||||
if (v)
|
||||
{
|
||||
char buffer[256];
|
||||
(void)regerror(rc, &re, buffer, sizeof(buffer));
|
||||
PRINTF("Compile error %d: %s (expected)\n", rc, buffer);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
for (; *subjects != NULL; subjects++)
|
||||
{
|
||||
rc = regexec(&re, *subjects, CAPCOUNT, match, mflags[i]);
|
||||
|
||||
PRINTF("Subject: %s\n", *subjects);
|
||||
PRINTF("Return: %d", rc);
|
||||
|
||||
if (rc != *(++rd))
|
||||
{
|
||||
PRINTF("\n");
|
||||
fprintf(stderr, "Unexpected match error %d (expected %d)\n", rc, *rd);
|
||||
fprintf(stderr, "Pattern is: %s\n", pattern);
|
||||
fprintf(stderr, "Subject is: %s\n", *subjects);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (rc == 0)
|
||||
{
|
||||
for (int j = 0; j < CAPCOUNT; j++)
|
||||
{
|
||||
regmatch_t *m = match + j;
|
||||
if (m->rm_so < 0) continue;
|
||||
if (m->rm_so != *(++rd) || m->rm_eo != *(++rd))
|
||||
{
|
||||
PRINTF("\n");
|
||||
fprintf(stderr, "Mismatched results for successful match\n");
|
||||
fprintf(stderr, "Pattern is: %s\n", pattern);
|
||||
fprintf(stderr, "Subject is: %s\n", *subjects);
|
||||
fprintf(stderr, "Result %d: expected %d %d received %d %d\n",
|
||||
j, rd[-1], rd[0], m->rm_so, m->rm_eo);
|
||||
return 1;
|
||||
}
|
||||
PRINTF(" (%d %d %d)", j, m->rm_so, m->rm_eo);
|
||||
}
|
||||
}
|
||||
|
||||
else if (v)
|
||||
{
|
||||
char buffer[256];
|
||||
(void)regerror(rc, &re, buffer, sizeof(buffer));
|
||||
PRINTF(": %s (expected)", buffer);
|
||||
}
|
||||
|
||||
PRINTF("\n");
|
||||
}
|
||||
|
||||
regfree(&re);
|
||||
}
|
||||
|
||||
PRINTF("End of test\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* End of pcre2posix_test.c */
|
Loading…
Reference in a new issue