apophysis7x/System/sdStringTable.pas

634 lines
15 KiB
ObjectPascal
Raw Normal View History

{ unit sdStringTable
Author: Nils Haeck M.Sc. (n.haeck@simdesign.nl)
Original Date: 28 May 2007
Version: 1.1
Copyright (c) 2007 - 2010 Simdesign BV
It is NOT allowed under ANY circumstances to publish or copy this code
without accepting the license conditions in accompanying LICENSE.txt
first!
This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF
ANY KIND, either express or implied.
Please visit http://www.simdesign.nl/xml.html for more information.
}
unit sdStringTable;
interface
{$i NativeXml.inc}
uses
Classes, SysUtils, Contnrs;
type
// A record describing a string by its first position and length (Count)
TsdStringRec = record
First: Pbyte;
Count: integer;
end;
// A string reference item used in string reference lists (do not use directly)
TsdRefString = class
private
FID: integer;
FFrequency: integer;
FFirst: Pbyte;
FCharCount: integer;
protected
procedure SetString(const SR: TsdStringRec);
function CompareToSR(const SR: TsdStringRec): integer;
function StringRec: TsdStringRec;
public
destructor Destroy; override;
function AsString: UTF8String;
property CharCount: integer read FCharCount;
property Frequency: integer read FFrequency;
end;
// A list of string reference items (do not use directly)
TsdRefStringList = class(TObjectList)
private
function GetItems(Index: integer): TsdRefString;
protected
// Assumes list is sorted by StringID
function IndexOfID(AID: integer; var Index: integer): boolean;
// Assumes list is sorted by string rec
function IndexOfSR(const AStringRec: TsdStringRec; var Index: integer): boolean;
public
property Items[Index: integer]: TsdRefString read GetItems; default;
end;
// A string table, holding a collection of unique strings, sorted in 2 ways
// for fast access. Strings can be added with AddString or AddStringRec,
// and should be updated with SetString. When a string is added or updated,
// an ID is returned which the application can use to retrieve the string,
// using GetString.
TsdStringTable = class(TPersistent)
private
FByID: TsdRefStringList;
FBySR: TsdRefStringList;
protected
procedure DecFrequency(AItem: TsdRefString; ByIdIndex: integer);
function NextUniqueID: integer;
public
constructor Create;
destructor Destroy; override;
procedure Clear;
// Add a new string rec, return fresh ID or ID of existing item, and increase
// the existing item's ref count
function AddStringRec(const SR: TsdStringRec): integer;
// Add a new string S to the table, the function returns its ID.
function AddString(const S: UTF8String): integer;
// Get the refstring by ID
function ById(index: integer): TsdRefString;
// Delete refstring by ID
procedure Delete(ByIdIndex: integer);
// determine if the stringrec exists
function ExistStringRec(const SR: TsdStringRec): boolean;
// Get the string of refstring with ID
function GetString(ID: integer): UTF8String;
// Set the string value of refstring with ID.
procedure SetString(var ID: integer; const S: UTF8String);
// Number of refstrings
function StringCount: integer;
procedure SaveToFile(const AFileName: string);
procedure SaveToStream(S: TStream);
end;
{utility functions}
// convert a string into a string rec
function sdStringToSR(const S: Utf8String): TsdStringRec;
// convert a string rec into a string
function sdSRToString(const SR: TsdStringRec): Utf8String;
// compare two string recs. This is NOT an alphabetic compare. SRs are first
// compared by length, then by first byte, then last byte then second, then
// N-1, until all bytes are compared.
function sdCompareSR(const SR1, SR2: TsdStringRec): integer;
// compare 2 bytes
function sdCompareByte(Byte1, Byte2: byte): integer;
// compare 2 integers
function sdCompareInteger(Int1, Int2: integer): integer;
function sdUtf16ToUtf8Mem(Src: Pword; Dst: Pbyte; Count: integer): integer;
function sdUtf8ToUtf16Mem(var Src: Pbyte; Dst: Pword; Count: integer): integer;
procedure sdStreamWrite(S: TStream; const AString: AnsiString);
procedure sdStreamWriteStringRec(S: TStream; const AStringRec: TsdStringRec);
procedure sdStreamWriteRefString(S: TStream; ARefString: TsdRefString);
implementation
{ TsdRefString }
function TsdRefString.AsString: UTF8String;
begin
Result := sdSRToString(StringRec);
end;
function TsdRefString.CompareToSR(const SR: TsdStringRec): integer;
begin
if SR.Count = 0 then
begin
// shortcut
Result := 1;
exit;
end;
Result := sdCompareSR(StringRec, SR);
end;
destructor TsdRefString.Destroy;
begin
FreeMem(FFirst);
inherited;
end;
procedure TsdRefString.SetString(const SR: TsdStringRec);
begin
FCharCount := SR.Count;
ReallocMem(FFirst, FCharCount);
Move(SR.First^, FFirst^, FCharCount);
end;
function TsdRefString.StringRec: TsdStringRec;
begin
Result.First := FFirst;
Result.Count := FCharCount;
end;
{ TsdRefStringList }
function TsdRefStringList.GetItems(Index: integer): TsdRefString;
begin
Result := Get(Index);
end;
function TsdRefStringList.IndexOfID(AID: integer; var Index: integer): boolean;
var
Min, Max: integer;
begin
Result := False;
// Find position - binary method
Index := 0;
Min := 0;
Max := Count;
while Min < Max do
begin
Index := (Min + Max) div 2;
case sdCompareInteger(Items[Index].FID, AID) of
-1: Min := Index + 1;
0: begin
Result := True;
exit;
end;
1: Max := Index;
end;
end;
Index := Min;
end;
function TsdRefStringList.IndexOfSR(const AStringRec: TsdStringRec; var Index: integer): boolean;
var
Min, Max: integer;
SR: TsdStringRec;
begin
Result := False;
// Find position - binary method
Index := 0;
Min := 0;
Max := Count;
while Min < Max do
begin
Index := (Min + Max) div 2;
SR := TsdRefString(Get(Index)).StringRec;
case sdCompareSR(SR, AStringRec) of
-1: Min := Index + 1;
0: begin
Result := True;
exit;
end;
1: Max := Index;
end;
end;
Index := Min;
end;
{ TsdStringTable }
function TsdStringTable.AddString(const S: UTF8String): integer;
var
SR: TsdStringRec;
begin
SR := sdStringToSR(S);
Result := AddStringRec(SR);
end;
function TsdStringTable.AddStringRec(const SR: TsdStringRec): integer;
var
BySRIndex: integer;
Item: TsdRefString;
NewSR: TsdStringRec;
Res: boolean;
begin
// zero-length string
if SR.Count = 0 then
begin
Result := 0;
exit;
end;
// Try to find the new string
if FBySR.IndexOfSR(SR, BySRIndex) then
begin
Item := FBySR.Items[BySRIndex];
inc(Item.FFrequency);
Result := Item.FID;
exit;
end;
// Not found.. must make new item
Item := TsdRefString.Create;
Item.SetString(SR);
NewSR := Item.StringRec;
Item.FID := NextUniqueID;
FById.Add(Item);
Item.FFrequency := 1;
// debug:
//SetLength(Item.FValue, Item.FCount);
//Move(Item.FirstPtr(FBase)^, Item.FValue[1], Item.FCount);
// Insert in BySR lists
Res := FBySR.IndexOfSR(NewSR, BySRIndex);
assert(Res = False);
FBySR.Insert(BySRIndex, Item);
Result := Item.FID;
end;
function TsdStringTable.ById(index: integer): TsdRefString;
begin
Result := FById[Index];
end;
procedure TsdStringTable.Clear;
begin
FByID.Clear;
FBySR.Clear;
end;
constructor TsdStringTable.Create;
begin
inherited Create;
FByID := TsdRefStringList.Create(False);
FBySR := TsdRefStringList.Create(True);
end;
procedure TsdStringTable.DecFrequency(AItem: TsdRefString; ByIdIndex: integer);
var
BySRIndex: integer;
Res: boolean;
begin
dec(AItem.FFrequency);
assert(AItem.FFrequency >= 0);
if AItem.FFrequency = 0 then
begin
// We must remove it
FById.Delete(ByIdIndex);
Res := FBySR.IndexOfSR(AItem.StringRec, BySRIndex);
assert(Res = True);
FBySR.Delete(BySRIndex);
end;
end;
procedure TsdStringTable.Delete(ByIdIndex: integer);
var
Item: TsdRefString;
BySRIndex: integer;
Res: boolean;
begin
Item := FById[ByIdIndex];
if Item = nil then
exit;
FById.Delete(ByIdIndex);
Res := FBySR.IndexOfSR(Item.StringRec, BySRIndex);
assert(Res = True);
FBySR.Delete(BySRIndex);
end;
destructor TsdStringTable.Destroy;
begin
FreeAndNil(FByID);
FreeAndNil(FBySR);
inherited;
end;
function TsdStringTable.ExistStringRec(const SR: TsdStringRec): boolean;
var
BySRIndex: integer;
begin
// zero-length string
if SR.Count = 0 then
begin
Result := False;
exit;
end;
// Try to find the new string
Result := FBySR.IndexOfSR(SR, BySRIndex);
end;
function TsdStringTable.GetString(ID: integer): UTF8String;
var
Index, Count: integer;
Item: TsdRefString;
begin
if ID = 0 then
begin
Result := '';
exit;
end;
// Find the ID
if FByID.IndexOfID(ID, Index) then
begin
Item := FById[Index];
Count := Item.FCharCount;
SetLength(Result, Count);
Move(Item.FFirst^, Result[1], Count);
exit;
end;
Result := '';
end;
function TsdStringTable.NextUniqueID: integer;
begin
if FById.Count = 0 then
Result := 1
else
Result := FByID[FByID.Count - 1].FID + 1;
end;
procedure TsdStringTable.SaveToFile(const AFileName: string);
var
F: TFileStream;
begin
F := TFileStream.Create(AFileName, fmCreate);
try
SaveToStream(F);
finally
F.Free;
end;
end;
procedure TsdStringTable.SaveToStream(S: TStream);
var
i: integer;
R: UTF8String;
begin
for i := 0 to FBySR.Count - 1 do
begin
R := FBySR[i].AsString + #13#10;
S.Write(R[1], length(R));
end;
end;
procedure TsdStringTable.SetString(var ID: integer; const S: UTF8String);
var
ByIdIndex: integer;
Item: TsdRefString;
SR: TsdStringRec;
begin
// Make temp string record
SR := sdStringtoSR(S);
// Do we have a ref string with this ID?
if (ID > 0) and FByID.IndexOfID(ID, ByIdIndex) then
begin
// Is the string still the same?
Item := FById[ByIdIndex];
if Item.CompareToSR(SR) = 0 then
exit;
// The string changed..
DecFrequency(Item, ByIdIndex);
end;
ID := AddStringRec(SR);
end;
{utility functions}
function TsdStringTable.StringCount: integer;
begin
Result := FBySR.Count;
end;
function sdStringToSR(const S: UTF8String): TsdStringRec;
begin
Result.Count := length(S);
if Result.Count = 0 then
Result.First := nil
else
Result.First := @S[1];
end;
function sdSRToString(const SR: TsdStringRec): UTF8String;
begin
SetLength(Result, SR.Count);
if SR.Count > 0 then
Move(SR.First^, Result[1], SR.Count);
end;
function sdCompareByte(Byte1, Byte2: byte): integer;
begin
if Byte1 < Byte2 then
Result := -1
else
if Byte1 > Byte2 then
Result := 1
else
Result := 0;
end;
function sdCompareInteger(Int1, Int2: integer): integer;
begin
if Int1 < Int2 then
Result := -1
else
if Int1 > Int2 then
Result := 1
else
Result := 0;
end;
function sdCompareSR(const SR1, SR2: TsdStringRec): integer;
var
Count: integer;
First1, First2, Last1, Last2: Pbyte;
begin
// Compare string length first
Result := sdCompareInteger(SR1.Count, SR2.Count);
if Result <> 0 then
exit;
// Compare first
Result := sdCompareByte(SR1.First^, SR2.First^);
if Result <> 0 then
exit;
Count := SR1.Count;
// Setup First & Last pointers
First1 := SR1.First;
First2 := SR2.First;
Last1 := First1; inc(Last1, Count);
Last2 := First2; inc(Last2, Count);
// Compare each time last ptrs then first ptrs, until they meet in the middle
repeat
dec(Last1);
dec(Last2);
if First1 = Last1 then
exit;
Result := sdCompareByte(Last1^, Last2^);
if Result <> 0 then
exit;
inc(First1); inc(First2);
if First1 = Last1 then
exit;
Result := sdCompareByte(First1^, First2^);
if Result <> 0 then
exit;
until False;
end;
function sdUtf16ToUtf8Mem(Src: Pword; Dst: Pbyte; Count: integer): integer;
// Convert an Unicode (UTF16 LE) memory block to UTF8. This routine will process
// Count wide characters (2 bytes size) to Count UTF8 characters (1-3 bytes).
// Therefore, the block at Dst must be at least 1.5 the size of the source block.
// The function returns the number of *bytes* written.
var
W: word;
DStart: Pbyte;
begin
DStart := Dst;
while Count > 0 do
begin
W := Src^;
inc(Src);
if W <= $7F then
begin
Dst^ := byte(W);
inc(Dst);
end else
begin
if W > $7FF then
begin
Dst^ := byte($E0 or (W shr 12));
inc(Dst);
Dst^ := byte($80 or ((W shr 6) and $3F));
inc(Dst);
Dst^ := byte($80 or (W and $3F));
inc(Dst);
end else
begin // $7F < W <= $7FF
Dst^ := byte($C0 or (W shr 6));
inc(Dst);
Dst^ := byte($80 or (W and $3F));
inc(Dst);
end;
end;
dec(Count);
end;
Result := integer(Dst) - integer(DStart);
end;
function sdUtf8ToUtf16Mem(var Src: Pbyte; Dst: Pword; Count: integer): integer;
// Convert an UTF8 memory block to Unicode (UTF16 LE). This routine will process
// Count *bytes* of UTF8 (each character 1-3 bytes) into UTF16 (each char 2 bytes).
// Therefore, the block at Dst must be at least 2 times the size of Count, since
// many UTF8 characters consist of just one byte, and are mapped to 2 bytes. The
// function returns the number of *wide chars* written. Note that the Src block must
// have an exact number of UTF8 characters in it, if Count doesn't match then
// the last character will be converted anyway (going past the block boundary!)
var
W: word;
C: byte;
DStart: Pword;
SClose: Pbyte;
begin
DStart := Dst;
SClose := Src;
inc(SClose, Count);
while integer(Src) < integer(SClose) do
begin
// 1st byte
W := Src^;
inc(Src);
if W and $80 <> 0 then
begin
W := W and $3F;
if W and $20 <> 0 then
begin
// 2nd byte
C := Src^;
inc(Src);
if C and $C0 <> $80 then
// malformed trail byte or out of range char
Continue;
W := (W shl 6) or (C and $3F);
end;
// 2nd or 3rd byte
C := Src^;
inc(Src);
if C and $C0 <> $80 then
// malformed trail byte
Continue;
Dst^ := (W shl 6) or (C and $3F);
inc(Dst);
end else
begin
Dst^ := W;
inc(Dst);
end;
end;
Result := (integer(Dst) - integer(DStart)) div 2;
end;
procedure sdStreamWrite(S: TStream; const AString: AnsiString);
var
L: integer;
begin
L := Length(AString);
if L > 0 then
begin
S.Write(AString[1], L);
end;
end;
procedure sdStreamWriteStringRec(S: TStream; const AStringRec: TsdStringRec);
begin
S.Write(PAnsiChar(AStringRec.First)^, AStringRec.Count);
end;
procedure sdStreamWriteRefString(S: TStream; ARefString: TsdRefString);
begin
if ARefString = nil then
exit;
S.Write(PAnsiChar(ARefString.FFirst)^, ARefString.FCharCount);
end;
end.