1 /++
2 Mir String Table designed for fast deserialization routines.
3 
4 License: $(HTTP www.apache.org/licenses/LICENSE-2.0, Apache-2.0)
5 Authors: Ilya Yaroshenko 
6 Macros:
7 +/
8 module mir.string_table;
9 
10 /++
11 Fast string table used to get key's id.
12 The keys should be first sorted by length and then lexicographically.
13 Params:
14     U = an unsigned type that can hold an index of sorted keys. `U.max` must be less then length of the table.
15     C = character type
16 +/
17 struct MirStringTable(U, C = char)
18     if (__traits(isUnsigned, U) && (is(C == char) || is(C == wchar) || is(C == dchar)))
19 {
20     /++
21     Keys sorted by length and then lexicographically.
22     +/
23     const(immutable(C)[])[] sortedKeys;
24 
25     private U[] table;
26 
27     /++
28     The keys should be first sorted by length and then lexicographically.
29 
30     The constructor uses GC.
31     It can be used in `@nogc` code when if constructed in compile time.
32     +/
33     this()(const(immutable(C)[])[] sortedKeys)
34         @trusted pure nothrow
35     {
36         pragma(inline, false);
37         this.sortedKeys = sortedKeys;
38         assert(sortedKeys.length <= U.max);
39         const largest = sortedKeys.length ? sortedKeys[$ - 1].length : 0;
40         table = new U[largest + 2];
41         size_t ski;
42         foreach (length; 0 .. largest + 1)
43         {
44             while(ski < sortedKeys.length && sortedKeys[ski].length == length)
45                 ski++;
46             table[length + 1] = cast(U)ski;
47         }
48     }
49 
50     /++
51     Params:
52         key = string to find index for
53         index = (ref) index to fill with key's position.
54     Returns:
55         true if keys index has been found
56     +/
57     bool get()(scope const C[] key, ref uint index)
58          const @trusted pure nothrow @nogc
59     {
60         import mir.utility: _expect;
61         if (_expect(key.length + 1 < table.length, true))
62         {
63 
64             // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
65             // 0 1 2 3 4 5 6   8 9 10    12          16
66 
67             auto low = table[key.length] + 0u;
68             auto high = table[key.length + 1] + 0u;
69             auto items = sortedKeys.ptr;
70             if (low < high)
71             {
72                 version (none)
73                 {
74                     if (key.length == 0)
75                     {
76                         index = 0;
77                         return true;
78                     }
79                 }
80                 L: do {
81                     auto mid = (low + high) / 2;
82 
83                     version (all)
84                     {
85                         import core.stdc..string: memcmp;
86                         int r = void;
87 
88                         if (__ctfe)
89                             r = __cmp(key, items[mid]);
90                         else
91                         version (BigEndian)
92                             r = memcmp(key.ptr, items[mid].ptr, key.length);
93                         else
94                         static if (C.sizeof == 1)
95                             r = memcmp(key.ptr, items[mid].ptr, key.length);
96                         else
97                             r = __cmp(key, items[mid]);
98 
99                         if (r == 0)
100                         {
101                             index = mid;
102                             return true;
103                         }
104                         if (r > 0)
105                             low = mid + 1;
106                         else
107                             high = mid;
108                     }
109                     else
110                     {
111                         size_t i;
112                         auto value = items[mid];
113                         do {
114                             if (key[i] < value[i])
115                             {
116                                 high = mid;
117                                 continue L;
118                             }
119                             else
120                             if (key[i] > value[i])
121                             {
122                                 low = mid + 1;
123                                 continue L;
124                             }
125                         } while (++i < key.length);
126                         index = mid;
127                         return true;
128                     }
129                 }
130                 while(low < high);
131             }
132         }
133         return false;
134     }
135 
136     ///
137     uint opIndex()(scope const C[] key)
138          const @trusted pure nothrow @nogc
139     {
140         import mir.utility: _expect;
141         uint ret = 0;
142         if (get(key, ret)._expect(true))
143             return ret;
144         assert(0);
145     }
146 }
147 
148 /// ditto
149 struct MirStringTable(size_t length, size_t maxKeyLength, bool caseInsensetive = false, C = char)
150     if (is(C == char) || is(C == wchar) || is(C == dchar))
151 {
152     ///
153     const(immutable(C)[])[length] sortedKeys;
154 
155     private alias U = minimalIndexType!length;
156     private U[maxKeyLength + 2] table;
157 
158     /++
159     The keys should be first sorted by length and then lexicographically.
160 
161     The constructor uses GC.
162     It can be used in `@nogc` code when if constructed in compile time.
163     +/
164     this(immutable(C)[][length] sortedKeys)
165         @trusted pure nothrow
166     {
167         pragma(inline, false);
168         this.sortedKeys = sortedKeys;
169         size_t ski;
170         foreach (length; 0 .. maxKeyLength + 1)
171         {
172             while(ski < sortedKeys.length && sortedKeys[ski].length == length)
173                 ski++;
174             table[length + 1] = cast(U)ski;
175         }
176     }
177 
178     /++
179     Params:
180         key = string to find index for
181         index = (ref) index to fill with key's position.
182     Returns:
183         true if keys index has been found
184     +/
185     bool get()(scope const(C)[] key, ref uint index)
186          const @trusted pure nothrow @nogc
187     {
188         import mir.utility: _expect;
189         if (_expect(key.length <= maxKeyLength, true))
190         {
191             static if (caseInsensetive)
192             {
193                 C[maxKeyLength] buffer = void;
194                 foreach(i, C c; key)
195                     buffer[i] = c.fastToUpper;
196                 key = buffer[0 .. key.length];
197             }
198             auto low = table[key.length] + 0u;
199             auto high = table[key.length + 1] + 0u;
200             auto items = sortedKeys.ptr;
201             if (low < high)
202             {
203                 static if (!(maxKeyLength >= 16))
204                 {
205                     if (key.length == 0)
206                     {
207                         index = 0;
208                         return true;
209                     }
210                 }
211                 L: do {
212                     auto mid = (low + high) / 2;
213 
214                     static if (maxKeyLength >= 16)
215                     {
216                         import core.stdc..string: memcmp;
217                         int r = void;
218 
219                         if (__ctfe)
220                             r = __cmp(key, items[mid]);
221                         else
222                         version (BigEndian)
223                             r = memcmp(key.ptr, items[mid].ptr, key.length);
224                         else
225                         static if (C.sizeof == 1)
226                             r = memcmp(key.ptr, items[mid].ptr, key.length);
227                         else
228                             r = __cmp(key, items[mid]);
229 
230                         if (r == 0)
231                         {
232                             index = mid;
233                             return true;
234                         }
235                         if (r > 0)
236                             low = mid + 1;
237                         else
238                             high = mid;
239                     }
240                     else
241                     {
242                         size_t i;
243                         auto value = items[mid];
244                         do {
245                             if (key[i] < value[i])
246                             {
247                                 high = mid;
248                                 continue L;
249                             }
250                             else
251                             if (key[i] > value[i])
252                             {
253                                 low = mid + 1;
254                                 continue L;
255                             }
256                         } while (++i < key.length);
257                         index = mid;
258                         return true;
259                     }
260                 }
261                 while(low < high);
262             }
263         }
264         return false;
265     }
266 
267     ///
268     uint opIndex()(scope const C[] key)
269          const @trusted pure nothrow @nogc
270     {
271         import mir.utility: _expect;
272         uint ret = 0;
273         if (get(key, ret)._expect(true))
274             return ret;
275         assert(0);
276     }
277 }
278 
279 package auto fastToUpper(C)(const C a)
280 {   // std.ascii may not be inlined
281     return 'a' <= a && a <= 'z' ? cast(C)(a ^ 0x20) : a;
282 }
283 
284 package @safe pure nothrow @nogc
285 C[] fastToUpperInPlace(C)(scope return C[] a)
286 {
287     foreach(ref C e; a)
288         e = e.fastToUpper;
289     return a;
290 }
291 
292 package immutable(C)[][] prepareStringTableKeys(bool caseInsensetive = false, C)(immutable(C)[][] keys)
293 {
294     static if (caseInsensetive)
295     {
296         foreach (ref key; keys)
297         {
298             auto upper = cast(immutable) key.dup.fastToUpperInPlace;
299             if (upper != key)
300                 key = upper;
301         }
302     }
303     import mir.utility: simpleSort;
304     return keys.simpleSort!smallerStringFirst;
305 }
306 
307 package template createTable(C)
308     if (is(C == char) || is(C == wchar) || is(C == dchar))
309 {
310     auto createTable(immutable(C)[][] keys, bool caseInsensetive = false)()
311     {
312         static immutable C[][] sortedKeys = prepareStringTableKeys!caseInsensetive(keys);
313         alias Table = MirStringTable!(sortedKeys.length, sortedKeys.length ? sortedKeys[$ - 1].length : 0, caseInsensetive, C);
314         static if (sortedKeys.length)
315             return Table(sortedKeys[0 .. sortedKeys.length]);
316         else
317             return Table.init;
318     }
319 }
320 
321 ///
322 @safe pure nothrow @nogc
323 version(mir_core_test) unittest
324 {
325     static immutable sortedKeys = ["", "a", "b", "aab", "abb", "aaaaa"];
326     static immutable table = MirStringTable!ubyte(sortedKeys); // CTFE
327     static assert (table[""] == 0);
328     static assert (table["a"] == 1);
329     static assert (table["b"] == 2);
330     static assert (table["abb"] == 4);
331     assert (table["aaaaa"] == 5);
332 }
333 
334 
335 ///
336 @safe pure nothrow
337 version(mir_core_test) unittest
338 {
339     import mir.utility: simpleSort;
340     auto keys = ["aaaaa", "abb", "", "b", "a", "aab"];
341     // sorts keys by length and then lexicographically.
342     keys.simpleSort!smallerStringFirst;
343     assert(keys == ["", "a", "b", "aab", "abb", "aaaaa"]);
344 }
345 
346 @safe pure nothrow
347 version(mir_core_test) unittest
348 {
349     import mir.utility: simpleSort;
350     auto keys = ["aaaaa"w, "abb"w, ""w, "b"w, "a"w, "aab"w];
351     // sorts keys by length and then lexicographically.
352     keys.simpleSort!smallerStringFirst;
353     assert(keys == [""w, "a"w, "b"w, "aab"w, "abb"w, "aaaaa"w]);
354 }
355 
356 package template minimalIndexType(size_t length)
357 {
358     static if (length <= ubyte.max)
359         alias minimalIndexType = ubyte;
360     else
361     static if (length <= ushort.max)
362         alias minimalIndexType = ushort;
363     else
364     static if (length <= uint.max)
365         alias minimalIndexType = uint;
366     else
367         alias minimalIndexType = ulong;
368 }
369 
370 package template minimalSignedIndexType(size_t length)
371 {
372     static if (length <= byte.max)
373         alias minimalSignedIndexType = byte;
374     else
375     static if (length <= short.max)
376         alias minimalSignedIndexType = short;
377     else
378     static if (length <= int.max)
379         alias minimalSignedIndexType = int;
380     else
381         alias minimalSignedIndexType = long;
382 }
383 
384 /++
385 Compares strings by length and then lexicographically.
386 +/
387 sizediff_t smallerStringFirstCmp(T)(T[] a, T[] b)
388 {
389     if (sizediff_t d = a.length - b.length)
390     {
391         return d;
392     }
393 
394     import std.traits: Unqual;
395     static if (is(Unqual!T == ubyte) || is(Unqual!T == char))
396     {
397         import core.stdc..string: memcmp;
398         if (__ctfe)
399             return __cmp(a, b);
400         else
401             return (() @trusted => memcmp(a.ptr, b.ptr, a.length))();
402     }
403     else
404     {
405         return __cmp(a, b);
406     }
407 }
408 
409 ///
410 @safe pure nothrow @nogc
411 version(mir_core_test) unittest
412 {
413     assert(smallerStringFirstCmp("aa", "bb") < 0);
414     assert(smallerStringFirstCmp("aa", "aa") == 0);
415     assert(smallerStringFirstCmp("aaa", "aa") > 0);
416 
417     static assert(smallerStringFirstCmp("aa", "bb") < 0);
418     static assert(smallerStringFirstCmp("aa", "aa") == 0);
419     static assert(smallerStringFirstCmp("aaa", "aa") > 0);
420 }
421 
422 /++
423 Compares strings by length and then lexicographically.
424 +/
425 template smallerStringFirst(alias direction = "<")
426     if (direction == "<" || direction == ">")
427 {
428     ///
429     bool smallerStringFirst(T)(T[] a, T[] b)
430     {
431         auto r = smallerStringFirstCmp(a, b);
432         static if (direction == "<")
433             return r < 0;
434         else
435             return r > 0;
436     }
437 }
438 
439 ///
440 @safe pure nothrow @nogc
441 version(mir_core_test) unittest
442 {
443     assert(smallerStringFirst("aa", "bb") == true);
444     assert(smallerStringFirst("aa", "aa") == false);
445     assert(smallerStringFirst("aaa", "aa") == false);
446 }