925 lines
32 KiB
ObjectPascal
925 lines
32 KiB
ObjectPascal
|
{**************************************************************************************************}
|
||
|
{ }
|
||
|
{ Perl Regular Expressions VCL component }
|
||
|
{ }
|
||
|
{ The contents of this file are subject to the Mozilla Public License Version 1.1 (the "License"); }
|
||
|
{ you may not use this file except in compliance with the License. You may obtain a copy of the }
|
||
|
{ License at http://www.mozilla.org/MPL/ }
|
||
|
{ }
|
||
|
{ Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF }
|
||
|
{ ANY KIND, either express or implied. See the License for the specific language governing rights }
|
||
|
{ and limitations under the License. }
|
||
|
{ }
|
||
|
{ The Original Code is PerlRegEx.pas. }
|
||
|
{ }
|
||
|
{ The Initial Developer of the Original Code is Jan Goyvaerts. }
|
||
|
{ Portions created by Jan Goyvaerts are Copyright (C) 1999, 2005, 2008 Jan Goyvaerts. }
|
||
|
{ All rights reserved. }
|
||
|
{ }
|
||
|
{ Design & implementation, by Jan Goyvaerts, 1999, 2005, 2008 }
|
||
|
{ }
|
||
|
{ TPerlRegEx is available at http://www.regular-expressions.info/delphi.html }
|
||
|
{ }
|
||
|
{**************************************************************************************************}
|
||
|
|
||
|
unit PerlRegEx;
|
||
|
|
||
|
interface
|
||
|
|
||
|
uses
|
||
|
Windows, Messages, SysUtils, Classes,
|
||
|
pcre;
|
||
|
|
||
|
type
|
||
|
TPerlRegExOptions = set of (
|
||
|
preCaseLess, // /i -> Case insensitive
|
||
|
preMultiLine, // /m -> ^ and $ also match before/after a newline, not just at the beginning and the end of the PCREString
|
||
|
preSingleLine, // /s -> Dot matches any character, including \n (newline). Otherwise, it matches anything except \n
|
||
|
preExtended, // /x -> Allow regex to contain extra whitespace, newlines and Perl-style comments, all of which will be filtered out
|
||
|
preAnchored, // /A -> Successful match can only occur at the start of the subject or right after the previous match
|
||
|
preUnGreedy, // Repeat operators (+, *, ?) are not greedy by default (i.e. they try to match the minimum number of characters instead of the maximum)
|
||
|
preNoAutoCapture // (group) is a non-capturing group; only named groups capture
|
||
|
);
|
||
|
|
||
|
type
|
||
|
TPerlRegExState = set of (
|
||
|
preNotBOL, // Not Beginning Of Line: ^ does not match at the start of Subject
|
||
|
preNotEOL, // Not End Of Line: $ does not match at the end of Subject
|
||
|
preNotEmpty // Empty matches not allowed
|
||
|
);
|
||
|
|
||
|
const
|
||
|
// Maximum number of subexpressions (backreferences)
|
||
|
// Subexpressions are created by placing round brackets in the regex, and are referenced by \1, \2, ...
|
||
|
// In Perl, they are available as $1, $2, ... after the regex matched; with TPerlRegEx, use the Subexpressions property
|
||
|
// You can also insert \1, \2, ... in the Replacement PCREString; \0 is the complete matched expression
|
||
|
MAX_SUBEXPRESSIONS = 99;
|
||
|
|
||
|
{$IFDEF UNICODE}
|
||
|
// All implicit string casts have been verified to be correct
|
||
|
{$WARN IMPLICIT_STRING_CAST OFF}
|
||
|
// Use UTF-8 in Delphi 2009 and later, so Unicode strings are handled correctly.
|
||
|
// PCRE does not support UTF-16
|
||
|
type
|
||
|
PCREString = UTF8String;
|
||
|
{$ELSE UNICODE}
|
||
|
// Use AnsiString in Delphi 2007 and earlier
|
||
|
type
|
||
|
PCREString = AnsiString;
|
||
|
{$ENDIF UNICODE}
|
||
|
|
||
|
type
|
||
|
TPerlRegExReplaceEvent = procedure(Sender: TObject; var ReplaceWith: PCREString) of object;
|
||
|
|
||
|
type
|
||
|
TPerlRegEx = class(TComponent)
|
||
|
private // *** Property storage, getters and setters
|
||
|
FCompiled, FStudied: Boolean;
|
||
|
FOptions: TPerlRegExOptions;
|
||
|
FState: TPerlRegExState;
|
||
|
FRegEx, FReplacement, FSubject: PCREString;
|
||
|
FStart, FStop: Integer;
|
||
|
FOnMatch: TNotifyEvent;
|
||
|
FOnReplace: TPerlRegExReplaceEvent;
|
||
|
function GetMatchedExpression: PCREString;
|
||
|
function GetMatchedExpressionLength: Integer;
|
||
|
function GetMatchedExpressionOffset: Integer;
|
||
|
procedure SetOptions(Value: TPerlRegExOptions);
|
||
|
procedure SetRegEx(const Value: PCREString);
|
||
|
function GetSubExpressionCount: Integer;
|
||
|
function GetSubExpressions(Index: Integer): PCREString;
|
||
|
function GetSubExpressionLengths(Index: Integer): Integer;
|
||
|
function GetSubExpressionOffsets(Index: Integer): Integer;
|
||
|
procedure SetSubject(const Value: PCREString);
|
||
|
procedure SetStart(const Value: Integer);
|
||
|
procedure SetStop(const Value: Integer);
|
||
|
function GetFoundMatch: Boolean;
|
||
|
private // *** Variables used by pcrelib.dll
|
||
|
Offsets: array[0..(MAX_SUBEXPRESSIONS+1)*3] of Integer;
|
||
|
OffsetCount: Integer;
|
||
|
pcreOptions: Integer;
|
||
|
pattern, hints, chartable: Pointer;
|
||
|
FSubjectPChar: PAnsiChar;
|
||
|
FHasStoredSubExpressions: Boolean;
|
||
|
FStoredSubExpressions: array of PCREString;
|
||
|
function GetSubjectLeft: PCREString;
|
||
|
function GetSubjectRight: PCREString;
|
||
|
protected
|
||
|
procedure CleanUp;
|
||
|
// Dispose off whatever we created, so we can start over. Called automatically when needed, so it is not made public
|
||
|
procedure ClearStoredSubExpressions;
|
||
|
public
|
||
|
constructor Create(AOwner: TComponent); override;
|
||
|
// Come to life
|
||
|
destructor Destroy; override;
|
||
|
// Clean up after ourselves
|
||
|
class function EscapeRegExChars(const S: string): string;
|
||
|
// Escapes regex characters in S so that the regex engine can be used to match S as plain text
|
||
|
procedure Compile;
|
||
|
// Compile the regex. Called automatically by Match
|
||
|
procedure Study;
|
||
|
// Study the regex. Studying takes time, but will make the execution of the regex a lot faster.
|
||
|
// Call study if you will be using the same regex many times
|
||
|
function Match: Boolean;
|
||
|
// Attempt to match the regex
|
||
|
function MatchAgain: Boolean;
|
||
|
// Attempt to match the regex to the remainder of the string after the previous match
|
||
|
// To avoid problems (when using ^ in the regex), call MatchAgain only after a succesful Match()
|
||
|
function Replace: PCREString;
|
||
|
// Replace matched expression in Subject with ComputeReplacement. Returns the actual replacement text from ComputeReplacement
|
||
|
function ReplaceAll: Boolean;
|
||
|
// Repeat MatchAgain and Replace until you drop. Returns True if anything was replaced at all.
|
||
|
function ComputeReplacement: PCREString;
|
||
|
// Returns Replacement with backreferences filled in
|
||
|
procedure StoreSubExpressions;
|
||
|
// Stores duplicates of SubExpressions[] so they and ComputeReplacement will still return the proper strings
|
||
|
// even if FSubject is changed or cleared
|
||
|
function NamedSubExpression(const SEName: PCREString): Integer;
|
||
|
// Returns the index of the named group SEName
|
||
|
procedure Split(Strings: TStrings; Limit: Integer);
|
||
|
// Split Subject along regex matches. Items are appended to PCREStrings.
|
||
|
property Compiled: Boolean read FCompiled;
|
||
|
// True if the RegEx has already been compiled.
|
||
|
property FoundMatch: Boolean read GetFoundMatch;
|
||
|
// Returns True when MatchedExpression* and SubExpression* indicate a match
|
||
|
property Studied: Boolean read FStudied;
|
||
|
// True if the RegEx has already been studied
|
||
|
property MatchedExpression: PCREString read GetMatchedExpression;
|
||
|
// The matched PCREString
|
||
|
property MatchedExpressionLength: Integer read GetMatchedExpressionLength;
|
||
|
// Length of the matched PCREString
|
||
|
property MatchedExpressionOffset: Integer read GetMatchedExpressionOffset;
|
||
|
// Character offset in the Subject PCREString at which the matched subPCREString starts
|
||
|
property Start: Integer read FStart write SetStart;
|
||
|
// Starting position in Subject from which MatchAgain begins
|
||
|
property Stop: Integer read FStop write SetStop;
|
||
|
// Last character in Subject that Match and MatchAgain search through
|
||
|
property State: TPerlRegExState read FState write FState;
|
||
|
// State of Subject
|
||
|
property SubExpressionCount: Integer read GetSubExpressionCount;
|
||
|
// Number of matched subexpressions
|
||
|
property SubExpressions[Index: Integer]: PCREString read GetSubExpressions;
|
||
|
// Matched subexpressions after a regex has been matched
|
||
|
property SubExpressionLengths[Index: Integer]: Integer read GetSubExpressionLengths;
|
||
|
// Lengths of the subexpressions
|
||
|
property SubExpressionOffsets[Index: Integer]: Integer read GetSubExpressionOffsets;
|
||
|
// Character offsets in the Subject PCREString of the subexpressions
|
||
|
property Subject: PCREString read FSubject write SetSubject;
|
||
|
// The PCREString on which Match() will try to match RegEx
|
||
|
property SubjectLeft: PCREString read GetSubjectLeft;
|
||
|
// Part of the subject to the left of the match
|
||
|
property SubjectRight: PCREString read GetSubjectRight;
|
||
|
// Part of the subject to the right of the match
|
||
|
published
|
||
|
property Options: TPerlRegExOptions read FOptions write SetOptions;
|
||
|
// Options
|
||
|
property RegEx: PCREString read FRegEx write SetRegEx;
|
||
|
// The regular expression to be matched
|
||
|
property Replacement: PCREString read FReplacement write FReplacement;
|
||
|
// PCREString to replace matched expression with. \number backreferences will be substituted with SubExpressions
|
||
|
// TPerlRegEx supports the "JGsoft" replacement text flavor as explained at http://www.regular-expressions.info/refreplace.html
|
||
|
property OnMatch: TNotifyEvent read FOnMatch write FOnMatch;
|
||
|
// Triggered by Match and MatchAgain after a successful match
|
||
|
property OnReplace: TPerlRegExReplaceEvent read FOnReplace write FOnReplace;
|
||
|
// Triggered by Replace and ReplaceAll just before the replacement is done, allowing you to determine the new PCREString
|
||
|
end;
|
||
|
|
||
|
{
|
||
|
You can add TPerlRegEx components to a TPerlRegExList to match them all together on the same subject,
|
||
|
as if they were one regex regex1|regex2|regex3|...
|
||
|
TPerlRegExList does not own the TPerlRegEx components, just like a TList
|
||
|
If a TPerlRegEx has been added to a TPerlRegExList, it should not be used in any other situation
|
||
|
until it is removed from the list
|
||
|
}
|
||
|
|
||
|
type
|
||
|
TPerlRegExList = class
|
||
|
private
|
||
|
FList: TList;
|
||
|
FSubject: PCREString;
|
||
|
FMatchedRegEx: TPerlRegEx;
|
||
|
FStart, FStop: Integer;
|
||
|
function GetRegEx(Index: Integer): TPerlRegEx;
|
||
|
procedure SetRegEx(Index: Integer; Value: TPerlRegEx);
|
||
|
procedure SetSubject(const Value: PCREString);
|
||
|
procedure SetStart(const Value: Integer);
|
||
|
procedure SetStop(const Value: Integer);
|
||
|
function GetCount: Integer;
|
||
|
protected
|
||
|
procedure UpdateRegEx(ARegEx: TPerlRegEx);
|
||
|
public
|
||
|
constructor Create;
|
||
|
destructor Destroy; override;
|
||
|
public
|
||
|
function Add(ARegEx: TPerlRegEx): Integer;
|
||
|
procedure Clear;
|
||
|
procedure Delete(Index: Integer);
|
||
|
function IndexOf(ARegEx: TPerlRegEx): Integer;
|
||
|
procedure Insert(Index: Integer; ARegEx: TPerlRegEx);
|
||
|
public
|
||
|
function Match: Boolean;
|
||
|
function MatchAgain: Boolean;
|
||
|
property RegEx[Index: Integer]: TPerlRegEx read GetRegEx write SetRegEx;
|
||
|
property Count: Integer read GetCount;
|
||
|
property Subject: PCREString read FSubject write SetSubject;
|
||
|
property Start: Integer read FStart write SetStart;
|
||
|
property Stop: Integer read FStop write SetStop;
|
||
|
property MatchedRegEx: TPerlRegEx read FMatchedRegEx;
|
||
|
end;
|
||
|
|
||
|
procedure Register;
|
||
|
|
||
|
implementation
|
||
|
|
||
|
|
||
|
{ ********* Unit support routines ********* }
|
||
|
|
||
|
procedure Register;
|
||
|
begin
|
||
|
RegisterComponents('JGsoft', [TPerlRegEx]);
|
||
|
end;
|
||
|
|
||
|
function FirstCap(const S: string): string;
|
||
|
begin
|
||
|
if S = '' then Result := ''
|
||
|
else begin
|
||
|
Result := AnsiLowerCase(S);
|
||
|
{$IFDEF UNICODE}
|
||
|
CharUpperBuffW(@Result[1], 1);
|
||
|
{$ELSE}
|
||
|
CharUpperBuffA(@Result[1], 1);
|
||
|
{$ENDIF}
|
||
|
end
|
||
|
end;
|
||
|
|
||
|
function InitialCaps(const S: string): string;
|
||
|
var
|
||
|
I: Integer;
|
||
|
Up: Boolean;
|
||
|
begin
|
||
|
Result := AnsiLowerCase(S);
|
||
|
Up := True;
|
||
|
{$IFDEF UNICODE}
|
||
|
for I := 1 to Length(Result) do begin
|
||
|
case Result[I] of
|
||
|
#0..'&', '(', '*', '+', ',', '-', '.', '?', '<', '[', '{', #$00B7:
|
||
|
Up := True
|
||
|
else
|
||
|
if Up and (Result[I] <> '''') then begin
|
||
|
CharUpperBuffW(@Result[I], 1);
|
||
|
Up := False
|
||
|
end
|
||
|
end;
|
||
|
end;
|
||
|
{$ELSE UNICODE}
|
||
|
if SysLocale.FarEast then begin
|
||
|
I := 1;
|
||
|
while I <= Length(Result) do begin
|
||
|
if Result[I] in LeadBytes then begin
|
||
|
Inc(I, 2)
|
||
|
end
|
||
|
else begin
|
||
|
if Result[I] in [#0..'&', '('..'.', '?', '<', '[', '{'] then Up := True
|
||
|
else if Up and (Result[I] <> '''') then begin
|
||
|
CharUpperBuffA(@Result[I], 1);
|
||
|
Result[I] := UpperCase(Result[I])[1];
|
||
|
Up := False
|
||
|
end;
|
||
|
Inc(I)
|
||
|
end
|
||
|
end
|
||
|
end
|
||
|
else
|
||
|
for I := 1 to Length(Result) do begin
|
||
|
if Result[I] in [#0..'&', '('..'.', '?', '<', '[', '{', #$B7] then Up := True
|
||
|
else if Up and (Result[I] <> '''') then begin
|
||
|
CharUpperBuffA(@Result[I], 1);
|
||
|
Result[I] := AnsiUpperCase(Result[I])[1];
|
||
|
Up := False
|
||
|
end
|
||
|
end;
|
||
|
{$ENDIF UNICODE}
|
||
|
end;
|
||
|
|
||
|
|
||
|
{ ********* TPerlRegEx component ********* }
|
||
|
|
||
|
procedure TPerlRegEx.CleanUp;
|
||
|
begin
|
||
|
FCompiled := False; FStudied := False;
|
||
|
pcre_dispose(pattern, hints, nil);
|
||
|
pattern := nil;
|
||
|
hints := nil;
|
||
|
ClearStoredSubExpressions;
|
||
|
OffsetCount := 0;
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegEx.ClearStoredSubExpressions;
|
||
|
begin
|
||
|
FHasStoredSubExpressions := False;
|
||
|
FStoredSubExpressions := nil;
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegEx.Compile;
|
||
|
var
|
||
|
Error: PAnsiChar;
|
||
|
ErrorOffset: Integer;
|
||
|
begin
|
||
|
if FRegEx = '' then
|
||
|
raise Exception.Create('TPerlRegEx.Compile() - Please specify a regular expression in RegEx first');
|
||
|
CleanUp;
|
||
|
Pattern := pcre_compile(PAnsiChar(FRegEx), pcreOptions, @Error, @ErrorOffset, chartable);
|
||
|
if Pattern = nil then
|
||
|
raise Exception.Create(Format('TPerlRegEx.Compile() - Error in regex at offset %d: %s', [ErrorOffset, AnsiString(Error)]));
|
||
|
FCompiled := True
|
||
|
end;
|
||
|
|
||
|
(* Backreference overview:
|
||
|
|
||
|
Assume there are 13 backreferences:
|
||
|
|
||
|
Text TPerlRegex .NET Java ECMAScript
|
||
|
$17 $1 + "7" "$17" $1 + "7" $1 + "7"
|
||
|
$017 $1 + "7" "$017" $1 + "7" $1 + "7"
|
||
|
$12 $12 $12 $12 $12
|
||
|
$012 $1 + "2" $12 $12 $1 + "2"
|
||
|
${1}2 $1 + "2" $1 + "2" error "${1}2"
|
||
|
$$ "$" "$" error "$"
|
||
|
\$ "$" "\$" "$" "\$"
|
||
|
*)
|
||
|
|
||
|
function TPerlRegEx.ComputeReplacement: PCREString;
|
||
|
var
|
||
|
Mode: AnsiChar;
|
||
|
S: PCREString;
|
||
|
I, J, N: Integer;
|
||
|
|
||
|
procedure ReplaceBackreference(Number: Integer);
|
||
|
var
|
||
|
Backreference: PCREString;
|
||
|
begin
|
||
|
Delete(S, I, J-I);
|
||
|
if Number <= SubExpressionCount then begin
|
||
|
Backreference := SubExpressions[Number];
|
||
|
if Backreference <> '' then begin
|
||
|
// Ignore warnings; converting to UTF-8 does not cause data loss
|
||
|
case Mode of
|
||
|
'L', 'l': Backreference := AnsiLowerCase(Backreference);
|
||
|
'U', 'u': Backreference := AnsiUpperCase(Backreference);
|
||
|
'F', 'f': Backreference := FirstCap(Backreference);
|
||
|
'I', 'i': Backreference := InitialCaps(Backreference);
|
||
|
end;
|
||
|
if S <> '' then begin
|
||
|
Insert(Backreference, S, I);
|
||
|
I := I + Length(Backreference);
|
||
|
end
|
||
|
else begin
|
||
|
S := Backreference;
|
||
|
I := MaxInt;
|
||
|
end
|
||
|
end;
|
||
|
end
|
||
|
end;
|
||
|
|
||
|
procedure ProcessBackreference(NumberOnly, Dollar: Boolean);
|
||
|
var
|
||
|
Number, Number2: Integer;
|
||
|
Group: PCREString;
|
||
|
begin
|
||
|
Number := -1;
|
||
|
if (J <= Length(S)) and (S[J] in ['0'..'9']) then begin
|
||
|
// Get the number of the backreference
|
||
|
Number := Ord(S[J]) - Ord('0');
|
||
|
Inc(J);
|
||
|
if (J <= Length(S)) and (S[J] in ['0'..'9']) then begin
|
||
|
// Expand it to two digits only if that would lead to a valid backreference
|
||
|
Number2 := Number*10 + Ord(S[J]) - Ord('0');
|
||
|
if Number2 <= SubExpressionCount then begin
|
||
|
Number := Number2;
|
||
|
Inc(J)
|
||
|
end;
|
||
|
end;
|
||
|
end
|
||
|
else if not NumberOnly then begin
|
||
|
if Dollar and (J < Length(S)) and (S[J] = '{') then begin
|
||
|
// Number or name in curly braces
|
||
|
Inc(J);
|
||
|
case S[J] of
|
||
|
'0'..'9': begin
|
||
|
Number := Ord(S[J]) - Ord('0');
|
||
|
Inc(J);
|
||
|
while (J <= Length(S)) and (S[J] in ['0'..'9']) do begin
|
||
|
Number := Number*10 + Ord(S[J]) - Ord('0');
|
||
|
Inc(J)
|
||
|
end;
|
||
|
end;
|
||
|
'A'..'Z', 'a'..'z', '_': begin
|
||
|
Inc(J);
|
||
|
while (J <= Length(S)) and (S[J] in ['A'..'Z', 'a'..'z', '0'..'9', '_']) do Inc(J);
|
||
|
if (J <= Length(S)) and (S[J] = '}') then begin
|
||
|
Group := Copy(S, I+2, J-I-2);
|
||
|
Number := NamedSubExpression(Group);
|
||
|
end
|
||
|
end;
|
||
|
end;
|
||
|
if (J > Length(S)) or (S[J] <> '}') then Number := -1
|
||
|
else Inc(J)
|
||
|
end
|
||
|
else if Dollar and (S[J] = '_') then begin
|
||
|
// $_ (whole subject)
|
||
|
Delete(S, I, J+1-I);
|
||
|
Insert(Subject, S, I);
|
||
|
I := I + Length(Subject);
|
||
|
Exit;
|
||
|
end
|
||
|
else case S[J] of
|
||
|
'&': begin
|
||
|
// \& or $& (whole regex match)
|
||
|
Number := 0;
|
||
|
Inc(J);
|
||
|
end;
|
||
|
'+': begin
|
||
|
// \+ or $+ (highest-numbered participating group)
|
||
|
Number := SubExpressionCount;
|
||
|
Inc(J);
|
||
|
end;
|
||
|
'`': begin
|
||
|
// \` or $` (backtick; subject to the left of the match)
|
||
|
Delete(S, I, J+1-I);
|
||
|
Insert(SubjectLeft, S, I);
|
||
|
I := I + Offsets[0] - 1;
|
||
|
Exit;
|
||
|
end;
|
||
|
'''': begin
|
||
|
// \' or $' (straight quote; subject to the right of the match)
|
||
|
Delete(S, I, J+1-I);
|
||
|
Insert(SubjectRight, S, I);
|
||
|
I := I + Length(Subject) - Offsets[1];
|
||
|
Exit;
|
||
|
end
|
||
|
end;
|
||
|
end;
|
||
|
if Number >= 0 then ReplaceBackreference(Number)
|
||
|
else Inc(I)
|
||
|
end;
|
||
|
|
||
|
begin
|
||
|
S := FReplacement;
|
||
|
I := 1;
|
||
|
while I < Length(S) do begin
|
||
|
case S[I] of
|
||
|
'\': begin
|
||
|
J := I + 1;
|
||
|
Assert(J <= Length(S), 'CHECK: We let I stop one character before the end, so J cannot point beyond the end of the PCREString here');
|
||
|
case S[J] of
|
||
|
'$', '\': begin
|
||
|
Delete(S, I, 1);
|
||
|
Inc(I);
|
||
|
end;
|
||
|
'g': begin
|
||
|
if (J < Length(S)-1) and (S[J+1] = '<') and (S[J+2] in ['A'..'Z', 'a'..'z', '_']) then begin
|
||
|
// Python-style named group reference \g<name>
|
||
|
J := J+3;
|
||
|
while (J <= Length(S)) and (S[J] in ['0'..'9', 'A'..'Z', 'a'..'z', '_']) do Inc(J);
|
||
|
if (J <= Length(S)) and (S[J] = '>') then begin
|
||
|
N := NamedSubExpression(Copy(S, I+3, J-I-3));
|
||
|
Inc(J);
|
||
|
Mode := #0;
|
||
|
if N > 0 then ReplaceBackreference(N)
|
||
|
else Delete(S, I, J-I)
|
||
|
end
|
||
|
else I := J
|
||
|
end
|
||
|
else I := I+2;
|
||
|
end;
|
||
|
'l', 'L', 'u', 'U', 'f', 'F', 'i', 'I': begin
|
||
|
Mode := S[J];
|
||
|
Inc(J);
|
||
|
ProcessBackreference(True, False);
|
||
|
end;
|
||
|
else begin
|
||
|
Mode := #0;
|
||
|
ProcessBackreference(False, False);
|
||
|
end;
|
||
|
end;
|
||
|
end;
|
||
|
'$': begin
|
||
|
J := I + 1;
|
||
|
Assert(J <= Length(S), 'CHECK: We let I stop one character before the end, so J cannot point beyond the end of the PCREString here');
|
||
|
if S[J] = '$' then begin
|
||
|
Delete(S, J, 1);
|
||
|
Inc(I);
|
||
|
end
|
||
|
else begin
|
||
|
Mode := #0;
|
||
|
ProcessBackreference(False, True);
|
||
|
end
|
||
|
end;
|
||
|
else Inc(I)
|
||
|
end
|
||
|
end;
|
||
|
Result := S
|
||
|
end;
|
||
|
|
||
|
constructor TPerlRegEx.Create(AOwner: TComponent);
|
||
|
begin
|
||
|
inherited Create(AOwner);
|
||
|
FState := [preNotEmpty];
|
||
|
chartable := pcre_maketables;
|
||
|
{$IFDEF UNICODE}
|
||
|
pcreOptions := PCRE_UTF8 or PCRE_NEWLINE_ANY;
|
||
|
{$ELSE}
|
||
|
pcreOptions := PCRE_NEWLINE_ANY;
|
||
|
{$ENDIF}
|
||
|
end;
|
||
|
|
||
|
destructor TPerlRegEx.Destroy;
|
||
|
begin
|
||
|
pcre_dispose(pattern, hints, chartable);
|
||
|
inherited Destroy;
|
||
|
end;
|
||
|
|
||
|
class function TPerlRegEx.EscapeRegExChars(const S: string): string;
|
||
|
var
|
||
|
I: Integer;
|
||
|
begin
|
||
|
Result := S;
|
||
|
I := Length(Result);
|
||
|
while I > 0 do begin
|
||
|
case Result[I] of
|
||
|
'.', '[', ']', '(', ')', '?', '*', '+', '{', '}', '^', '$', '|', '\':
|
||
|
Insert('\', Result, I);
|
||
|
#0: begin
|
||
|
Result[I] := '0';
|
||
|
Insert('\', Result, I);
|
||
|
end;
|
||
|
end;
|
||
|
Dec(I);
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.GetFoundMatch: Boolean;
|
||
|
begin
|
||
|
Result := OffsetCount > 0;
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.GetMatchedExpression: PCREString;
|
||
|
begin
|
||
|
Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
|
||
|
Result := GetSubExpressions(0);
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.GetMatchedExpressionLength: Integer;
|
||
|
begin
|
||
|
Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
|
||
|
Result := GetSubExpressionLengths(0)
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.GetMatchedExpressionOffset: Integer;
|
||
|
begin
|
||
|
Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
|
||
|
Result := GetSubExpressionOffsets(0)
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.GetSubExpressionCount: Integer;
|
||
|
begin
|
||
|
Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
|
||
|
Result := OffsetCount-1
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.GetSubExpressionLengths(Index: Integer): Integer;
|
||
|
begin
|
||
|
Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
|
||
|
Assert((Index >= 0) and (Index <= SubExpressionCount), 'REQUIRE: Index <= SubExpressionCount');
|
||
|
Result := Offsets[Index*2+1]-Offsets[Index*2]
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.GetSubExpressionOffsets(Index: Integer): Integer;
|
||
|
begin
|
||
|
Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
|
||
|
Assert((Index >= 0) and (Index <= SubExpressionCount), 'REQUIRE: Index <= SubExpressionCount');
|
||
|
Result := Offsets[Index*2]
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.GetSubExpressions(Index: Integer): PCREString;
|
||
|
begin
|
||
|
Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
|
||
|
if Index > SubExpressionCount then Result := ''
|
||
|
else if FHasStoredSubExpressions then Result := FStoredSubExpressions[Index]
|
||
|
else Result := Copy(FSubject, Offsets[Index*2], Offsets[Index*2+1]-Offsets[Index*2]);
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.GetSubjectLeft: PCREString;
|
||
|
begin
|
||
|
Result := Copy(Subject, 1, Offsets[0]-1);
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.GetSubjectRight: PCREString;
|
||
|
begin
|
||
|
Result := Copy(Subject, Offsets[1], MaxInt);
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.Match: Boolean;
|
||
|
var
|
||
|
I, Opts: Integer;
|
||
|
begin
|
||
|
ClearStoredSubExpressions;
|
||
|
if not Compiled then Compile;
|
||
|
if preNotBOL in State then Opts := PCRE_NOTBOL else Opts := 0;
|
||
|
if preNotEOL in State then Opts := Opts or PCRE_NOTEOL;
|
||
|
if preNotEmpty in State then Opts := Opts or PCRE_NOTEMPTY;
|
||
|
OffsetCount := pcre_exec(Pattern, Hints, FSubjectPChar, FStop, 0, Opts, @Offsets[0], High(Offsets));
|
||
|
Result := OffsetCount > 0;
|
||
|
// Convert offsets into PCREString indices
|
||
|
if Result then begin
|
||
|
for I := 0 to OffsetCount*2-1 do
|
||
|
Inc(Offsets[I]);
|
||
|
FStart := Offsets[1];
|
||
|
if Offsets[0] = Offsets[1] then Inc(FStart); // Make sure we don't get stuck at the same position
|
||
|
if Assigned(OnMatch) then OnMatch(Self)
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.MatchAgain: Boolean;
|
||
|
var
|
||
|
I, Opts: Integer;
|
||
|
begin
|
||
|
ClearStoredSubExpressions;
|
||
|
if not Compiled then Compile;
|
||
|
if preNotBOL in State then Opts := PCRE_NOTBOL else Opts := 0;
|
||
|
if preNotEOL in State then Opts := Opts or PCRE_NOTEOL;
|
||
|
if preNotEmpty in State then Opts := Opts or PCRE_NOTEMPTY;
|
||
|
if FStart-1 > FStop then OffsetCount := -1
|
||
|
else OffsetCount := pcre_exec(Pattern, Hints, FSubjectPChar, FStop, FStart-1, Opts, @Offsets[0], High(Offsets));
|
||
|
Result := OffsetCount > 0;
|
||
|
// Convert offsets into PCREString indices
|
||
|
if Result then begin
|
||
|
for I := 0 to OffsetCount*2-1 do
|
||
|
Inc(Offsets[I]);
|
||
|
FStart := Offsets[1];
|
||
|
if Offsets[0] = Offsets[1] then Inc(FStart); // Make sure we don't get stuck at the same position
|
||
|
if Assigned(OnMatch) then OnMatch(Self)
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.NamedSubExpression(const SEName: PCREString): Integer;
|
||
|
begin
|
||
|
Result := pcre_get_stringnumber(Pattern, PAnsiChar(SEName));
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.Replace: PCREString;
|
||
|
begin
|
||
|
Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
|
||
|
// Substitute backreferences
|
||
|
Result := ComputeReplacement;
|
||
|
// Allow for just-in-time substitution determination
|
||
|
if Assigned(OnReplace) then OnReplace(Self, Result);
|
||
|
// Perform substitution
|
||
|
Delete(FSubject, MatchedExpressionOffset, MatchedExpressionLength);
|
||
|
if Result <> '' then Insert(Result, FSubject, MatchedExpressionOffset);
|
||
|
FSubjectPChar := PAnsiChar(FSubject);
|
||
|
// Position to continue search
|
||
|
FStart := FStart - MatchedExpressionLength + Length(Result);
|
||
|
FStop := FStop - MatchedExpressionLength + Length(Result);
|
||
|
// Replacement no longer matches regex, we assume
|
||
|
ClearStoredSubExpressions;
|
||
|
OffsetCount := 0;
|
||
|
end;
|
||
|
|
||
|
function TPerlRegEx.ReplaceAll: Boolean;
|
||
|
begin
|
||
|
if Match then begin
|
||
|
Result := True;
|
||
|
repeat
|
||
|
Replace
|
||
|
until not MatchAgain;
|
||
|
end
|
||
|
else Result := False;
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegEx.SetOptions(Value: TPerlRegExOptions);
|
||
|
begin
|
||
|
if (FOptions <> Value) then begin
|
||
|
FOptions := Value;
|
||
|
{$IFDEF UNICODE}
|
||
|
pcreOptions := PCRE_UTF8 or PCRE_NEWLINE_ANY;
|
||
|
{$ELSE}
|
||
|
pcreOptions := PCRE_NEWLINE_ANY;
|
||
|
{$ENDIF}
|
||
|
if (preCaseLess in Value) then pcreOptions := pcreOptions or PCRE_CASELESS;
|
||
|
if (preMultiLine in Value) then pcreOptions := pcreOptions or PCRE_MULTILINE;
|
||
|
if (preSingleLine in Value) then pcreOptions := pcreOptions or PCRE_DOTALL;
|
||
|
if (preExtended in Value) then pcreOptions := pcreOptions or PCRE_EXTENDED;
|
||
|
if (preAnchored in Value) then pcreOptions := pcreOptions or PCRE_ANCHORED;
|
||
|
if (preUnGreedy in Value) then pcreOptions := pcreOptions or PCRE_UNGREEDY;
|
||
|
if (preNoAutoCapture in Value) then pcreOptions := pcreOptions or PCRE_NO_AUTO_CAPTURE;
|
||
|
CleanUp
|
||
|
end
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegEx.SetRegEx(const Value: PCREString);
|
||
|
begin
|
||
|
if FRegEx <> Value then begin
|
||
|
FRegEx := Value;
|
||
|
CleanUp
|
||
|
end
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegEx.SetStart(const Value: Integer);
|
||
|
begin
|
||
|
if Value < 1 then FStart := 1
|
||
|
else FStart := Value;
|
||
|
// If FStart > Length(Subject), MatchAgain() will simply return False
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegEx.SetStop(const Value: Integer);
|
||
|
begin
|
||
|
if Value > Length(Subject) then FStop := Length(Subject)
|
||
|
else FStop := Value;
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegEx.SetSubject(const Value: PCREString);
|
||
|
begin
|
||
|
FSubject := Value;
|
||
|
FSubjectPChar := PAnsiChar(Value);
|
||
|
FStart := 1;
|
||
|
FStop := Length(Subject);
|
||
|
if not FHasStoredSubExpressions then OffsetCount := 0;
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegEx.Split(Strings: TStrings; Limit: Integer);
|
||
|
var
|
||
|
Offset, Count: Integer;
|
||
|
begin
|
||
|
Assert(Strings <> nil, 'REQUIRE: Strings');
|
||
|
if (Limit = 1) or not Match then Strings.Add(Subject)
|
||
|
else begin
|
||
|
Offset := 1;
|
||
|
Count := 1;
|
||
|
repeat
|
||
|
Strings.Add(Copy(Subject, Offset, MatchedExpressionOffset - Offset));
|
||
|
Inc(Count);
|
||
|
Offset := MatchedExpressionOffset + MatchedExpressionLength;
|
||
|
until ((Limit > 1) and (Count >= Limit)) or not MatchAgain;
|
||
|
Strings.Add(Copy(Subject, Offset, MaxInt));
|
||
|
end
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegEx.StoreSubExpressions;
|
||
|
var
|
||
|
I: Integer;
|
||
|
begin
|
||
|
if OffsetCount > 0 then begin
|
||
|
ClearStoredSubExpressions;
|
||
|
SetLength(FStoredSubExpressions, SubExpressionCount+1);
|
||
|
for I := SubExpressionCount downto 0 do
|
||
|
FStoredSubExpressions[I] := SubExpressions[I];
|
||
|
FHasStoredSubExpressions := True;
|
||
|
end
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegEx.Study;
|
||
|
var
|
||
|
Error: PAnsiChar;
|
||
|
begin
|
||
|
if not FCompiled then Compile;
|
||
|
Hints := pcre_study(Pattern, 0, @Error);
|
||
|
if Error <> nil then
|
||
|
raise Exception.Create('TPerlRegEx.Study() - Error studying the regex: ' + AnsiString(Error));
|
||
|
FStudied := True
|
||
|
end;
|
||
|
|
||
|
{ TPerlRegExList }
|
||
|
|
||
|
function TPerlRegExList.Add(ARegEx: TPerlRegEx): Integer;
|
||
|
begin
|
||
|
Result := FList.Add(ARegEx);
|
||
|
UpdateRegEx(ARegEx);
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegExList.Clear;
|
||
|
begin
|
||
|
FList.Clear;
|
||
|
end;
|
||
|
|
||
|
constructor TPerlRegExList.Create;
|
||
|
begin
|
||
|
inherited Create;
|
||
|
FList := TList.Create;
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegExList.Delete(Index: Integer);
|
||
|
begin
|
||
|
FList.Delete(Index);
|
||
|
end;
|
||
|
|
||
|
destructor TPerlRegExList.Destroy;
|
||
|
begin
|
||
|
FList.Free;
|
||
|
inherited
|
||
|
end;
|
||
|
|
||
|
function TPerlRegExList.GetCount: Integer;
|
||
|
begin
|
||
|
Result := FList.Count;
|
||
|
end;
|
||
|
|
||
|
function TPerlRegExList.GetRegEx(Index: Integer): TPerlRegEx;
|
||
|
begin
|
||
|
Result := TPerlRegEx(Pointer(FList[Index]));
|
||
|
end;
|
||
|
|
||
|
function TPerlRegExList.IndexOf(ARegEx: TPerlRegEx): Integer;
|
||
|
begin
|
||
|
Result := FList.IndexOf(ARegEx);
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegExList.Insert(Index: Integer; ARegEx: TPerlRegEx);
|
||
|
begin
|
||
|
FList.Insert(Index, ARegEx);
|
||
|
UpdateRegEx(ARegEx);
|
||
|
end;
|
||
|
|
||
|
function TPerlRegExList.Match: Boolean;
|
||
|
begin
|
||
|
SetStart(1);
|
||
|
FMatchedRegEx := nil;
|
||
|
Result := MatchAgain;
|
||
|
end;
|
||
|
|
||
|
function TPerlRegExList.MatchAgain: Boolean;
|
||
|
var
|
||
|
I, MatchStart, MatchPos: Integer;
|
||
|
ARegEx: TPerlRegEx;
|
||
|
begin
|
||
|
if FMatchedRegEx <> nil then
|
||
|
MatchStart := FMatchedRegEx.MatchedExpressionOffset + FMatchedRegEx.MatchedExpressionLength
|
||
|
else
|
||
|
MatchStart := FStart;
|
||
|
FMatchedRegEx := nil;
|
||
|
MatchPos := MaxInt;
|
||
|
for I := 0 to Count-1 do begin
|
||
|
ARegEx := RegEx[I];
|
||
|
if (not ARegEx.FoundMatch) or (ARegEx.MatchedExpressionOffset < MatchStart) then begin
|
||
|
ARegEx.Start := MatchStart;
|
||
|
ARegEx.MatchAgain;
|
||
|
end;
|
||
|
if ARegEx.FoundMatch and (ARegEx.MatchedExpressionOffset < MatchPos) then begin
|
||
|
MatchPos := ARegEx.MatchedExpressionOffset;
|
||
|
FMatchedRegEx := ARegEx;
|
||
|
end;
|
||
|
if MatchPos = MatchStart then Break;
|
||
|
end;
|
||
|
Result := MatchPos < MaxInt;
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegExList.SetRegEx(Index: Integer; Value: TPerlRegEx);
|
||
|
begin
|
||
|
FList[Index] := Value;
|
||
|
UpdateRegEx(Value);
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegExList.SetStart(const Value: Integer);
|
||
|
var
|
||
|
I: Integer;
|
||
|
begin
|
||
|
if FStart <> Value then begin
|
||
|
FStart := Value;
|
||
|
for I := Count-1 downto 0 do
|
||
|
RegEx[I].Start := Value;
|
||
|
FMatchedRegEx := nil;
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegExList.SetStop(const Value: Integer);
|
||
|
var
|
||
|
I: Integer;
|
||
|
begin
|
||
|
if FStop <> Value then begin
|
||
|
FStop := Value;
|
||
|
for I := Count-1 downto 0 do
|
||
|
RegEx[I].Stop := Value;
|
||
|
FMatchedRegEx := nil;
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegExList.SetSubject(const Value: PCREString);
|
||
|
var
|
||
|
I: Integer;
|
||
|
begin
|
||
|
if FSubject <> Value then begin
|
||
|
FSubject := Value;
|
||
|
for I := Count-1 downto 0 do
|
||
|
RegEx[I].Subject := Value;
|
||
|
FMatchedRegEx := nil;
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
procedure TPerlRegExList.UpdateRegEx(ARegEx: TPerlRegEx);
|
||
|
begin
|
||
|
ARegEx.Subject := FSubject;
|
||
|
ARegEx.Start := FStart;
|
||
|
end;
|
||
|
|
||
|
end.
|