Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99: */
3 :
4 : // Copyright 2012 the V8 project authors. All rights reserved.
5 : // Redistribution and use in source and binary forms, with or without
6 : // modification, are permitted provided that the following conditions are
7 : // met:
8 : //
9 : // * Redistributions of source code must retain the above copyright
10 : // notice, this list of conditions and the following disclaimer.
11 : // * Redistributions in binary form must reproduce the above
12 : // copyright notice, this list of conditions and the following
13 : // disclaimer in the documentation and/or other materials provided
14 : // with the distribution.
15 : // * Neither the name of Google Inc. nor the names of its
16 : // contributors may be used to endorse or promote products derived
17 : // from this software without specific prior written permission.
18 : //
19 : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 :
31 : #include "irregexp/RegExpEngine.h"
32 :
33 : #include <utility>
34 :
35 : #include "gc/GC.h"
36 : #include "irregexp/NativeRegExpMacroAssembler.h"
37 : #include "irregexp/RegExpCharacters.h"
38 : #include "irregexp/RegExpMacroAssembler.h"
39 : #include "jit/ExecutableAllocator.h"
40 : #include "jit/JitCommon.h"
41 :
42 : #include "irregexp/RegExpCharacters-inl.h"
43 :
44 : using namespace js;
45 : using namespace js::irregexp;
46 :
47 : using mozilla::ArrayLength;
48 : using mozilla::DebugOnly;
49 : using mozilla::Maybe;
50 :
51 : #define DEFINE_ACCEPT(Type) \
52 : void Type##Node::Accept(NodeVisitor* visitor) { \
53 : visitor->Visit##Type(this); \
54 : }
55 2628 : FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
56 : #undef DEFINE_ACCEPT
57 :
58 0 : void LoopChoiceNode::Accept(NodeVisitor* visitor) {
59 165 : visitor->VisitLoopChoice(this);
60 165 : }
61 :
62 : static const int kMaxLookaheadForBoyerMoore = 8;
63 :
64 2793 : RegExpNode::RegExpNode(LifoAlloc* alloc)
65 0 : : replacement_(nullptr), trace_count_(0), alloc_(alloc)
66 : {
67 2793 : bm_info_[0] = bm_info_[1] = nullptr;
68 0 : }
69 :
70 : static const int kMaxOneByteCharCode = 0xff;
71 : static const int kMaxUtf16CodeUnit = 0xffff;
72 :
73 : static char16_t
74 : MaximumCharacter(bool latin1)
75 : {
76 3932 : return latin1 ? kMaxOneByteCharCode : kMaxUtf16CodeUnit;
77 : }
78 :
79 : static void
80 144 : AddClass(const int* elmv, int elmc,
81 : CharacterRangeVector* ranges)
82 : {
83 0 : elmc--;
84 0 : MOZ_ASSERT(elmv[elmc] == 0x10000);
85 0 : for (int i = 0; i < elmc; i += 2) {
86 939 : MOZ_ASSERT(elmv[i] < elmv[i + 1]);
87 0 : ranges->append(CharacterRange(elmv[i], elmv[i + 1] - 1));
88 : }
89 144 : }
90 :
91 : void
92 150 : js::irregexp::AddClassNegated(const int* elmv,
93 : int elmc,
94 : CharacterRangeVector* ranges)
95 : {
96 0 : elmc--;
97 0 : MOZ_ASSERT(elmv[elmc] == 0x10000);
98 150 : MOZ_ASSERT(elmv[0] != 0x0000);
99 0 : MOZ_ASSERT(elmv[elmc-1] != kMaxUtf16CodeUnit);
100 : char16_t last = 0x0000;
101 0 : for (int i = 0; i < elmc; i += 2) {
102 0 : MOZ_ASSERT(last <= elmv[i] - 1);
103 0 : MOZ_ASSERT(elmv[i] < elmv[i + 1]);
104 1084 : ranges->append(CharacterRange(last, elmv[i] - 1));
105 0 : last = elmv[i + 1];
106 : }
107 300 : ranges->append(CharacterRange(last, kMaxUtf16CodeUnit));
108 150 : }
109 :
110 : void
111 371 : CharacterRange::AddClassEscape(LifoAlloc* alloc, char16_t type,
112 : CharacterRangeVector* ranges)
113 : {
114 0 : switch (type) {
115 : case 's':
116 79 : AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
117 0 : break;
118 : case 'S':
119 13 : AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
120 0 : break;
121 : case 'w':
122 28 : AddClass(kWordRanges, kWordRangeCount, ranges);
123 0 : break;
124 : case 'W':
125 1 : AddClassNegated(kWordRanges, kWordRangeCount, ranges);
126 0 : break;
127 : case 'd':
128 37 : AddClass(kDigitRanges, kDigitRangeCount, ranges);
129 0 : break;
130 : case 'D':
131 0 : AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
132 0 : break;
133 : case '.':
134 136 : AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges);
135 136 : break;
136 : // This is not a character range as defined by the spec but a
137 : // convenient shorthand for a character class that matches any
138 : // character.
139 : case '*':
140 77 : ranges->append(CharacterRange::Everything());
141 77 : break;
142 : // This is the set of characters matched by the $ and ^ symbols
143 : // in multiline mode.
144 : case 'n':
145 0 : AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges);
146 0 : break;
147 : default:
148 0 : MOZ_CRASH("Bad character class escape");
149 : }
150 371 : }
151 :
152 : // Add class escape, excluding surrogate pair range.
153 : void
154 0 : CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
155 : CharacterRangeVector* ranges, bool ignore_case)
156 : {
157 0 : switch (type) {
158 : case 's':
159 : case 'd':
160 0 : return AddClassEscape(alloc, type, ranges);
161 : break;
162 : case 'S':
163 0 : AddClassNegated(kSpaceAndSurrogateRanges, kSpaceAndSurrogateRangeCount, ranges);
164 0 : break;
165 : case 'w':
166 0 : if (ignore_case)
167 0 : AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, ranges);
168 : else
169 0 : AddClassEscape(alloc, type, ranges);
170 : break;
171 : case 'W':
172 0 : if (ignore_case) {
173 : AddClass(kNegatedIgnoreCaseWordAndSurrogateRanges,
174 0 : kNegatedIgnoreCaseWordAndSurrogateRangeCount, ranges);
175 : } else {
176 0 : AddClassNegated(kWordAndSurrogateRanges, kWordAndSurrogateRangeCount, ranges);
177 : }
178 : break;
179 : case 'D':
180 0 : AddClassNegated(kDigitAndSurrogateRanges, kDigitAndSurrogateRangeCount, ranges);
181 0 : break;
182 : default:
183 0 : MOZ_CRASH("Bad type!");
184 : }
185 : }
186 :
187 : static bool
188 0 : RangesContainLatin1Equivalents(const CharacterRangeVector& ranges, bool unicode)
189 : {
190 0 : for (size_t i = 0; i < ranges.length(); i++) {
191 : // TODO(dcarney): this could be a lot more efficient.
192 0 : if (RangeContainsLatin1Equivalents(ranges[i], unicode))
193 : return true;
194 : }
195 : return false;
196 : }
197 :
198 : static const size_t kEcma262UnCanonicalizeMaxWidth = 4;
199 :
200 : // Returns the number of characters in the equivalence class, omitting those
201 : // that cannot occur in the source string if it is a one byte string.
202 : static MOZ_ALWAYS_INLINE int
203 883 : GetCaseIndependentLetters(char16_t character,
204 : bool latin1_subject,
205 : bool unicode,
206 : const char16_t* choices,
207 : size_t choices_length,
208 : char16_t* letters)
209 : {
210 0 : size_t count = 0;
211 5298 : for (size_t i = 0; i < choices_length; i++) {
212 4415 : char16_t c = choices[i];
213 :
214 : // Skip characters that can't appear in one byte strings.
215 4415 : if (!unicode && latin1_subject && c > kMaxOneByteCharCode)
216 : continue;
217 :
218 : // Watch for duplicates.
219 : bool found = false;
220 5443 : for (size_t j = 0; j < count; j++) {
221 3532 : if (letters[j] == c) {
222 : found = true;
223 : break;
224 : }
225 : }
226 4415 : if (found)
227 : continue;
228 :
229 1397 : letters[count++] = c;
230 : }
231 :
232 883 : return count;
233 : }
234 :
235 : static int
236 883 : GetCaseIndependentLetters(char16_t character,
237 : bool latin1_subject,
238 : bool unicode,
239 : char16_t* letters)
240 : {
241 883 : if (unicode) {
242 : const char16_t choices[] = {
243 : character,
244 0 : unicode::FoldCase(character),
245 0 : unicode::ReverseFoldCase1(character),
246 0 : unicode::ReverseFoldCase2(character),
247 0 : unicode::ReverseFoldCase3(character),
248 0 : };
249 0 : return GetCaseIndependentLetters(character, latin1_subject, unicode,
250 0 : choices, ArrayLength(choices), letters);
251 : }
252 :
253 0 : char16_t upper = unicode::ToUpperCase(character);
254 0 : unicode::CodepointsWithSameUpperCase others(character);
255 0 : char16_t other1 = others.other1();
256 1766 : char16_t other2 = others.other2();
257 1766 : char16_t other3 = others.other3();
258 :
259 : // ES 2017 draft 996af87b7072b3c3dd2b1def856c66f456102215 21.2.4.2
260 : // step 3.g.
261 : // The standard requires that non-ASCII characters cannot have ASCII
262 : // character codes in their equivalence class, even though this
263 : // situation occurs multiple times in the Unicode tables.
264 : static const unsigned kMaxAsciiCharCode = 127;
265 883 : if (upper <= kMaxAsciiCharCode) {
266 883 : if (character > kMaxAsciiCharCode) {
267 : // If Canonicalize(character) == character, all other characters
268 : // should be ignored.
269 0 : return GetCaseIndependentLetters(character, latin1_subject, unicode,
270 0 : &character, 1, letters);
271 : }
272 :
273 0 : if (other1 > kMaxAsciiCharCode)
274 0 : other1 = character;
275 0 : if (other2 > kMaxAsciiCharCode)
276 1 : other2 = character;
277 883 : if (other3 > kMaxAsciiCharCode)
278 0 : other3 = character;
279 : }
280 :
281 : const char16_t choices[] = {
282 : character,
283 : upper,
284 : other1,
285 : other2,
286 : other3
287 0 : };
288 883 : return GetCaseIndependentLetters(character, latin1_subject, unicode,
289 883 : choices, ArrayLength(choices), letters);
290 : }
291 :
292 : void
293 0 : CharacterRange::AddCaseEquivalents(bool is_latin1, bool unicode, CharacterRangeVector* ranges)
294 : {
295 83 : char16_t bottom = from();
296 0 : char16_t top = to();
297 :
298 83 : if (is_latin1 && !RangeContainsLatin1Equivalents(*this, unicode)) {
299 0 : if (bottom > kMaxOneByteCharCode)
300 : return;
301 57 : if (top > kMaxOneByteCharCode)
302 0 : top = kMaxOneByteCharCode;
303 : } else {
304 : // Nothing to do for surrogates.
305 26 : if (bottom >= unicode::LeadSurrogateMin && top <= unicode::TrailSurrogateMax)
306 : return;
307 : }
308 :
309 0 : for (char16_t c = bottom;; c++) {
310 : char16_t chars[kEcma262UnCanonicalizeMaxWidth];
311 0 : size_t length = GetCaseIndependentLetters(c, is_latin1, unicode, chars);
312 :
313 0 : for (size_t i = 0; i < length; i++) {
314 1254 : char16_t other = chars[i];
315 1254 : if (other == c)
316 : continue;
317 :
318 : // Try to combine with an existing range.
319 : bool found = false;
320 0 : for (size_t i = 0; i < ranges->length(); i++) {
321 2094 : CharacterRange& range = (*ranges)[i];
322 4188 : if (range.Contains(other)) {
323 : found = true;
324 : break;
325 0 : } else if (other == range.from() - 1) {
326 0 : range.set_from(other);
327 0 : found = true;
328 0 : break;
329 0 : } else if (other == range.to() + 1) {
330 0 : range.set_to(other);
331 250 : found = true;
332 250 : break;
333 : }
334 : }
335 :
336 480 : if (!found)
337 22 : ranges->append(CharacterRange::Singleton(other));
338 : }
339 :
340 0 : if (c == top)
341 : break;
342 691 : }
343 : }
344 :
345 : static bool
346 0 : CompareInverseRanges(const CharacterRangeVector& ranges, const int* special_class, size_t length)
347 : {
348 0 : length--; // Remove final 0x10000.
349 0 : MOZ_ASSERT(special_class[length] == 0x10000);
350 0 : MOZ_ASSERT(ranges.length() != 0);
351 0 : MOZ_ASSERT(length != 0);
352 418 : MOZ_ASSERT(special_class[0] != 0);
353 0 : if (ranges.length() != (length >> 1) + 1)
354 : return false;
355 85 : CharacterRange range = ranges[0];
356 0 : if (range.from() != 0)
357 : return false;
358 350 : for (size_t i = 0; i < length; i += 2) {
359 0 : if (special_class[i] != (range.to() + 1))
360 : return false;
361 306 : range = ranges[(i >> 1) + 1];
362 153 : if (special_class[i+1] != range.from())
363 : return false;
364 : }
365 0 : if (range.to() != 0xffff)
366 : return false;
367 44 : return true;
368 : }
369 :
370 : static bool
371 0 : CompareRanges(const CharacterRangeVector& ranges, const int* special_class, size_t length)
372 : {
373 0 : length--; // Remove final 0x10000.
374 361 : MOZ_ASSERT(special_class[length] == 0x10000);
375 0 : if (ranges.length() * 2 != length)
376 : return false;
377 0 : for (size_t i = 0; i < length; i += 2) {
378 216 : CharacterRange range = ranges[i >> 1];
379 108 : if (range.from() != special_class[i] || range.to() != special_class[i + 1] - 1)
380 : return false;
381 : }
382 : return true;
383 : }
384 :
385 : bool
386 201 : RegExpCharacterClass::is_standard(LifoAlloc* alloc)
387 : {
388 : // TODO(lrn): Remove need for this function, by not throwing away information
389 : // along the way.
390 0 : if (is_negated_)
391 : return false;
392 0 : if (set_.is_standard())
393 : return true;
394 0 : if (CompareRanges(set_.ranges(alloc), kSpaceRanges, kSpaceRangeCount)) {
395 10 : set_.set_standard_set_type('s');
396 0 : return true;
397 : }
398 0 : if (CompareInverseRanges(set_.ranges(alloc), kSpaceRanges, kSpaceRangeCount)) {
399 6 : set_.set_standard_set_type('S');
400 0 : return true;
401 : }
402 145 : if (CompareInverseRanges(set_.ranges(alloc),
403 : kLineTerminatorRanges,
404 : kLineTerminatorRangeCount)) {
405 82 : set_.set_standard_set_type('.');
406 0 : return true;
407 : }
408 104 : if (CompareRanges(set_.ranges(alloc),
409 : kLineTerminatorRanges,
410 : kLineTerminatorRangeCount)) {
411 0 : set_.set_standard_set_type('n');
412 0 : return true;
413 : }
414 0 : if (CompareRanges(set_.ranges(alloc), kWordRanges, kWordRangeCount)) {
415 12 : set_.set_standard_set_type('w');
416 0 : return true;
417 : }
418 0 : if (CompareInverseRanges(set_.ranges(alloc), kWordRanges, kWordRangeCount)) {
419 0 : set_.set_standard_set_type('W');
420 0 : return true;
421 : }
422 : return false;
423 : }
424 :
425 : bool
426 0 : CharacterRange::IsCanonical(const CharacterRangeVector& ranges)
427 : {
428 566 : int n = ranges.length();
429 566 : if (n <= 1)
430 : return true;
431 :
432 0 : int max = ranges[0].to();
433 0 : for (int i = 1; i < n; i++) {
434 1816 : CharacterRange next_range = ranges[i];
435 0 : if (next_range.from() <= max + 1)
436 : return false;
437 862 : max = next_range.to();
438 : }
439 : return true;
440 : }
441 :
442 : // Move a number of elements in a zonelist to another position
443 : // in the same list. Handles overlapping source and target areas.
444 : static
445 74 : void MoveRanges(CharacterRangeVector& list, int from, int to, int count)
446 : {
447 : // Ranges are potentially overlapping.
448 0 : if (from < to) {
449 193 : for (int i = count - 1; i >= 0; i--)
450 0 : list[to + i] = list[from + i];
451 : } else {
452 12 : for (int i = 0; i < count; i++)
453 0 : list[to + i] = list[from + i];
454 : }
455 74 : }
456 :
457 : static int
458 107 : InsertRangeInCanonicalList(CharacterRangeVector& list,
459 : int count,
460 : CharacterRange insert)
461 : {
462 : // Inserts a range into list[0..count[, which must be sorted
463 : // by from value and non-overlapping and non-adjacent, using at most
464 : // list[0..count] for the result. Returns the number of resulting
465 : // canonicalized ranges. Inserting a range may collapse existing ranges into
466 : // fewer ranges, so the return value can be anything in the range 1..count+1.
467 0 : char16_t from = insert.from();
468 0 : char16_t to = insert.to();
469 0 : int start_pos = 0;
470 0 : int end_pos = count;
471 0 : for (int i = count - 1; i >= 0; i--) {
472 516 : CharacterRange current = list[i];
473 0 : if (current.from() > to + 1) {
474 : end_pos = i;
475 0 : } else if (current.to() + 1 < from) {
476 63 : start_pos = i + 1;
477 63 : break;
478 : }
479 : }
480 :
481 : // Inserted range overlaps, or is adjacent to, ranges at positions
482 : // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
483 : // not affected by the insertion.
484 : // If start_pos == end_pos, the range must be inserted before start_pos.
485 : // if start_pos < end_pos, the entire range from start_pos to end_pos
486 : // must be merged with the insert range.
487 :
488 0 : if (start_pos == end_pos) {
489 : // Insert between existing ranges at position start_pos.
490 83 : if (start_pos < count) {
491 0 : MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
492 : }
493 166 : list[start_pos] = insert;
494 0 : return count + 1;
495 : }
496 0 : if (start_pos + 1 == end_pos) {
497 : // Replace single existing range at position start_pos.
498 0 : CharacterRange to_replace = list[start_pos];
499 0 : int new_from = Min(to_replace.from(), from);
500 44 : int new_to = Max(to_replace.to(), to);
501 44 : list[start_pos] = CharacterRange(new_from, new_to);
502 : return count;
503 : }
504 : // Replace a number of existing ranges from start_pos to end_pos - 1.
505 : // Move the remaining ranges down.
506 :
507 0 : int new_from = Min(list[start_pos].from(), from);
508 0 : int new_to = Max(list[end_pos - 1].to(), to);
509 2 : if (end_pos < count) {
510 0 : MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
511 : }
512 4 : list[start_pos] = CharacterRange(new_from, new_to);
513 2 : return count - (end_pos - start_pos) + 1;
514 : }
515 :
516 : void
517 0 : CharacterRange::Canonicalize(CharacterRangeVector& character_ranges)
518 : {
519 46 : if (character_ranges.length() <= 1) return;
520 : // Check whether ranges are already canonical (increasing, non-overlapping,
521 : // non-adjacent).
522 0 : int n = character_ranges.length();
523 0 : int max = character_ranges[0].to();
524 0 : int i = 1;
525 0 : while (i < n) {
526 210 : CharacterRange current = character_ranges[i];
527 105 : if (current.from() <= max + 1) {
528 : break;
529 : }
530 59 : max = current.to();
531 59 : i++;
532 : }
533 : // Canonical until the i'th range. If that's all of them, we are done.
534 46 : if (i == n) return;
535 :
536 : // The ranges at index i and forward are not canonicalized. Make them so by
537 : // doing the equivalent of insertion sort (inserting each into the previous
538 : // list, in order).
539 : // Notice that inserting a range can reduce the number of ranges in the
540 : // result due to combining of adjacent and overlapping ranges.
541 46 : int read = i; // Range to insert.
542 0 : size_t num_canonical = i; // Length of canonicalized part of list.
543 : do {
544 0 : num_canonical = InsertRangeInCanonicalList(character_ranges,
545 : num_canonical,
546 0 : character_ranges[read]);
547 107 : read++;
548 0 : } while (read < n);
549 :
550 72 : while (character_ranges.length() > num_canonical)
551 : character_ranges.popBack();
552 :
553 46 : MOZ_ASSERT(CharacterRange::IsCanonical(character_ranges));
554 : }
555 :
556 : // -------------------------------------------------------------------
557 : // SeqRegExpNode
558 :
559 : class VisitMarker
560 : {
561 : public:
562 2734 : explicit VisitMarker(NodeInfo* info)
563 0 : : info_(info)
564 : {
565 0 : MOZ_ASSERT(!info->visited);
566 2734 : info->visited = true;
567 0 : }
568 : ~VisitMarker() {
569 2734 : info_->visited = false;
570 : }
571 : private:
572 : NodeInfo* info_;
573 : };
574 :
575 : bool
576 0 : SeqRegExpNode::FillInBMInfo(int offset,
577 : int budget,
578 : BoyerMooreLookahead* bm,
579 : bool not_at_start)
580 : {
581 0 : if (!bm->CheckOverRecursed())
582 : return false;
583 0 : if (!on_success_->FillInBMInfo(offset, budget - 1, bm, not_at_start))
584 : return false;
585 0 : if (offset == 0)
586 0 : set_bm_info(not_at_start, bm);
587 : return true;
588 : }
589 :
590 : RegExpNode*
591 0 : SeqRegExpNode::FilterLATIN1(int depth, bool ignore_case, bool unicode)
592 : {
593 1949 : if (info()->replacement_calculated)
594 0 : return replacement();
595 :
596 1141 : if (depth < 0)
597 0 : return this;
598 :
599 0 : MOZ_ASSERT(!info()->visited);
600 2282 : VisitMarker marker(info());
601 2282 : return FilterSuccessor(depth - 1, ignore_case, unicode);
602 : }
603 :
604 : RegExpNode*
605 0 : SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case, bool unicode)
606 : {
607 0 : RegExpNode* next = on_success_->FilterLATIN1(depth - 1, ignore_case, unicode);
608 2130 : if (next == nullptr)
609 0 : return set_replacement(nullptr);
610 :
611 2130 : on_success_ = next;
612 4260 : return set_replacement(this);
613 : }
614 :
615 : // -------------------------------------------------------------------
616 : // ActionNode
617 :
618 : int
619 0 : ActionNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
620 : {
621 0 : if (budget <= 0)
622 : return 0;
623 0 : if (action_type_ == POSITIVE_SUBMATCH_SUCCESS)
624 : return 0; // Rewinds input!
625 0 : return on_success()->EatsAtLeast(still_to_find,
626 : budget - 1,
627 2678 : not_at_start);
628 : }
629 :
630 : bool
631 279 : ActionNode::FillInBMInfo(int offset,
632 : int budget,
633 : BoyerMooreLookahead* bm,
634 : bool not_at_start)
635 : {
636 279 : if (!bm->CheckOverRecursed())
637 : return false;
638 :
639 279 : if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
640 279 : if (!on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start))
641 : return false;
642 : }
643 279 : SaveBMInfo(bm, not_at_start, offset);
644 :
645 : return true;
646 : }
647 :
648 : /* static */ ActionNode*
649 0 : ActionNode::SetRegister(int reg,
650 : int val,
651 : RegExpNode* on_success)
652 : {
653 0 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(SET_REGISTER, on_success);
654 1 : result->data_.u_store_register.reg = reg;
655 31 : result->data_.u_store_register.value = val;
656 0 : return result;
657 : }
658 :
659 : /* static */ ActionNode*
660 0 : ActionNode::IncrementRegister(int reg, RegExpNode* on_success)
661 : {
662 1 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(INCREMENT_REGISTER, on_success);
663 31 : result->data_.u_increment_register.reg = reg;
664 0 : return result;
665 : }
666 :
667 : /* static */ ActionNode*
668 0 : ActionNode::StorePosition(int reg, bool is_capture, RegExpNode* on_success)
669 : {
670 0 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(STORE_POSITION, on_success);
671 1 : result->data_.u_position_register.reg = reg;
672 928 : result->data_.u_position_register.is_capture = is_capture;
673 0 : return result;
674 : }
675 :
676 : /* static */ ActionNode*
677 0 : ActionNode::ClearCaptures(Interval range, RegExpNode* on_success)
678 : {
679 0 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(CLEAR_CAPTURES, on_success);
680 1 : result->data_.u_clear_captures.range_from = range.from();
681 15 : result->data_.u_clear_captures.range_to = range.to();
682 0 : return result;
683 : }
684 :
685 : /* static */ ActionNode*
686 0 : ActionNode::BeginSubmatch(int stack_pointer_reg, int position_reg, RegExpNode* on_success)
687 : {
688 0 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(BEGIN_SUBMATCH, on_success);
689 1 : result->data_.u_submatch.stack_pointer_register = stack_pointer_reg;
690 11 : result->data_.u_submatch.current_position_register = position_reg;
691 0 : return result;
692 : }
693 :
694 : /* static */ ActionNode*
695 0 : ActionNode::PositiveSubmatchSuccess(int stack_pointer_reg,
696 : int restore_reg,
697 : int clear_capture_count,
698 : int clear_capture_from,
699 : RegExpNode* on_success)
700 : {
701 0 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(POSITIVE_SUBMATCH_SUCCESS, on_success);
702 0 : result->data_.u_submatch.stack_pointer_register = stack_pointer_reg;
703 0 : result->data_.u_submatch.current_position_register = restore_reg;
704 1 : result->data_.u_submatch.clear_register_count = clear_capture_count;
705 10 : result->data_.u_submatch.clear_register_from = clear_capture_from;
706 0 : return result;
707 : }
708 :
709 : /* static */ ActionNode*
710 0 : ActionNode::EmptyMatchCheck(int start_register,
711 : int repetition_register,
712 : int repetition_limit,
713 : RegExpNode* on_success)
714 : {
715 0 : ActionNode* result = on_success->alloc()->newInfallible<ActionNode>(EMPTY_MATCH_CHECK, on_success);
716 0 : result->data_.u_empty_match_check.start_register = start_register;
717 0 : result->data_.u_empty_match_check.repetition_register = repetition_register;
718 0 : result->data_.u_empty_match_check.repetition_limit = repetition_limit;
719 0 : return result;
720 : }
721 :
722 : // -------------------------------------------------------------------
723 : // TextNode
724 :
725 : int
726 0 : TextNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
727 : {
728 1745 : int answer = Length();
729 0 : if (answer >= still_to_find)
730 : return answer;
731 400 : if (budget <= 0)
732 : return answer;
733 :
734 : // We are not at start after this node so we set the last argument to 'true'.
735 0 : return answer + on_success()->EatsAtLeast(still_to_find - answer,
736 : budget - 1,
737 800 : true);
738 : }
739 :
740 : int
741 0 : TextNode::GreedyLoopTextLength()
742 : {
743 891 : TextElement elm = elements()[elements().length() - 1];
744 297 : return elm.cp_offset() + elm.length();
745 : }
746 :
747 : RegExpNode*
748 0 : TextNode::FilterLATIN1(int depth, bool ignore_case, bool unicode)
749 : {
750 1018 : if (info()->replacement_calculated)
751 0 : return replacement();
752 :
753 989 : if (depth < 0)
754 0 : return this;
755 :
756 0 : MOZ_ASSERT(!info()->visited);
757 0 : VisitMarker marker(info());
758 0 : int element_count = elements().length();
759 0 : for (int i = 0; i < element_count; i++) {
760 0 : TextElement elm = elements()[i];
761 0 : if (elm.text_type() == TextElement::ATOM) {
762 0 : CharacterVector& quarks = const_cast<CharacterVector&>(elm.atom()->data());
763 0 : for (size_t j = 0; j < quarks.length(); j++) {
764 21008 : uint16_t c = quarks[j];
765 0 : if (c <= kMaxOneByteCharCode)
766 : continue;
767 0 : if (!ignore_case)
768 0 : return set_replacement(nullptr);
769 :
770 : // Here, we need to check for characters whose upper and lower cases
771 : // are outside the Latin-1 range.
772 0 : char16_t converted = ConvertNonLatin1ToLatin1(c, unicode);
773 0 : if (converted == 0) {
774 : // Character is outside Latin-1 completely
775 0 : return set_replacement(nullptr);
776 : }
777 :
778 : // Convert quark to Latin-1 in place.
779 0 : quarks[j] = converted;
780 : }
781 : } else {
782 190 : MOZ_ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
783 0 : RegExpCharacterClass* cc = elm.char_class();
784 :
785 0 : CharacterRangeVector& ranges = cc->ranges(alloc());
786 190 : if (!CharacterRange::IsCanonical(ranges))
787 18 : CharacterRange::Canonicalize(ranges);
788 :
789 : // Now they are in order so we only need to look at the first.
790 0 : int range_count = ranges.length();
791 0 : if (cc->is_negated()) {
792 0 : if (range_count != 0 &&
793 26 : ranges[0].from() == 0 &&
794 0 : ranges[0].to() >= kMaxOneByteCharCode)
795 : {
796 : // This will be handled in a later filter.
797 0 : if (ignore_case && RangesContainLatin1Equivalents(ranges, unicode))
798 0 : continue;
799 0 : return set_replacement(nullptr);
800 : }
801 : } else {
802 354 : if (range_count == 0 ||
803 177 : ranges[0].from() > kMaxOneByteCharCode)
804 : {
805 : // This will be handled in a later filter.
806 0 : if (ignore_case && RangesContainLatin1Equivalents(ranges, unicode))
807 : continue;
808 0 : return set_replacement(nullptr);
809 : }
810 : }
811 : }
812 : }
813 989 : return FilterSuccessor(depth - 1, ignore_case, unicode);
814 : }
815 :
816 : void
817 0 : TextNode::CalculateOffsets()
818 : {
819 2156 : int element_count = elements().length();
820 :
821 : // Set up the offsets of the elements relative to the start. This is a fixed
822 : // quantity since a TextNode can only contain fixed-width things.
823 0 : int cp_offset = 0;
824 0 : for (int i = 0; i < element_count; i++) {
825 0 : TextElement& elm = elements()[i];
826 2216 : elm.set_cp_offset(cp_offset);
827 0 : cp_offset += elm.length();
828 : }
829 0 : }
830 :
831 0 : void TextNode::MakeCaseIndependent(bool is_latin1, bool unicode)
832 : {
833 0 : int element_count = elements().length();
834 0 : for (int i = 0; i < element_count; i++) {
835 0 : TextElement elm = elements()[i];
836 56 : if (elm.text_type() == TextElement::CHAR_CLASS) {
837 30 : RegExpCharacterClass* cc = elm.char_class();
838 :
839 : // None of the standard character classes is different in the case
840 : // independent case and it slows us down if we don't know that.
841 30 : if (cc->is_standard(alloc()))
842 3 : continue;
843 :
844 : // Similarly, there's nothing to do for the character class
845 : // containing all characters except line terminators and surrogates.
846 : // This one is added by UnicodeEverythingAtom.
847 54 : CharacterRangeVector& ranges = cc->ranges(alloc());
848 27 : if (CompareInverseRanges(ranges,
849 : kLineTerminatorAndSurrogateRanges,
850 : kLineTerminatorAndSurrogateRangeCount))
851 : {
852 : continue;
853 : }
854 :
855 0 : int range_count = ranges.length();
856 110 : for (int j = 0; j < range_count; j++)
857 166 : ranges[j].AddCaseEquivalents(is_latin1, unicode, &ranges);
858 : }
859 : }
860 56 : }
861 :
862 : // -------------------------------------------------------------------
863 : // AssertionNode
864 :
865 : int
866 0 : AssertionNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
867 : {
868 322 : if (budget <= 0)
869 : return 0;
870 :
871 : // If we know we are not at the start and we are asked "how many characters
872 : // will you match if you succeed?" then we can answer anything since false
873 : // implies false. So lets just return the max answer (still_to_find) since
874 : // that won't prevent us from preloading a lot of characters for the other
875 : // branches in the node graph.
876 322 : if (assertion_type() == AT_START && not_at_start)
877 : return still_to_find;
878 :
879 284 : return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
880 : }
881 :
882 : bool
883 0 : AssertionNode::FillInBMInfo(int offset, int budget, BoyerMooreLookahead* bm, bool not_at_start)
884 : {
885 23 : if (!bm->CheckOverRecursed())
886 : return false;
887 :
888 : // Match the behaviour of EatsAtLeast on this node.
889 23 : if (assertion_type() == AT_START && not_at_start)
890 : return true;
891 :
892 0 : if (!on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start))
893 : return false;
894 23 : SaveBMInfo(bm, not_at_start, offset);
895 : return true;
896 : }
897 :
898 : // -------------------------------------------------------------------
899 : // BackReferenceNode
900 :
901 : int
902 0 : BackReferenceNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
903 : {
904 0 : if (budget <= 0)
905 : return 0;
906 0 : return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
907 : }
908 :
909 : bool
910 0 : BackReferenceNode::FillInBMInfo(int offset, int budget, BoyerMooreLookahead* bm, bool not_at_start)
911 : {
912 : // Working out the set of characters that a backreference can match is too
913 : // hard, so we just say that any character can match.
914 0 : bm->SetRest(offset);
915 0 : SaveBMInfo(bm, not_at_start, offset);
916 0 : return true;
917 : }
918 :
919 : // -------------------------------------------------------------------
920 : // ChoiceNode
921 :
922 : int
923 651 : ChoiceNode::EatsAtLeastHelper(int still_to_find,
924 : int budget,
925 : RegExpNode* ignore_this_node,
926 : bool not_at_start)
927 : {
928 651 : if (budget <= 0)
929 : return 0;
930 :
931 0 : int min = 100;
932 0 : size_t choice_count = alternatives().length();
933 0 : budget = (budget - 1) / choice_count;
934 0 : for (size_t i = 0; i < choice_count; i++) {
935 4536 : RegExpNode* node = alternatives()[i].node();
936 0 : if (node == ignore_this_node) continue;
937 : int node_eats_at_least =
938 0 : node->EatsAtLeast(still_to_find, budget, not_at_start);
939 0 : if (node_eats_at_least < min)
940 808 : min = node_eats_at_least;
941 1935 : if (min == 0)
942 : return 0;
943 : }
944 : return min;
945 : }
946 :
947 : int
948 0 : ChoiceNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
949 : {
950 316 : return EatsAtLeastHelper(still_to_find,
951 : budget,
952 : nullptr,
953 632 : not_at_start);
954 : }
955 :
956 : void
957 323 : ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
958 : RegExpCompiler* compiler,
959 : int characters_filled_in,
960 : bool not_at_start)
961 : {
962 0 : not_at_start = (not_at_start || not_at_start_);
963 0 : int choice_count = alternatives().length();
964 323 : MOZ_ASSERT(choice_count > 0);
965 969 : alternatives()[0].node()->GetQuickCheckDetails(details,
966 : compiler,
967 : characters_filled_in,
968 0 : not_at_start);
969 0 : for (int i = 1; i < choice_count; i++) {
970 1910 : QuickCheckDetails new_details(details->characters());
971 1910 : RegExpNode* node = alternatives()[i].node();
972 : node->GetQuickCheckDetails(&new_details, compiler,
973 : characters_filled_in,
974 0 : not_at_start);
975 : // Here we merge the quick match details of the two branches.
976 0 : details->Merge(&new_details, characters_filled_in);
977 : }
978 323 : }
979 :
980 : bool
981 69 : ChoiceNode::FillInBMInfo(int offset,
982 : int budget,
983 : BoyerMooreLookahead* bm,
984 : bool not_at_start)
985 : {
986 69 : if (!bm->CheckOverRecursed())
987 : return false;
988 :
989 0 : const GuardedAlternativeVector& alts = alternatives();
990 0 : budget = (budget - 1) / alts.length();
991 0 : for (size_t i = 0; i < alts.length(); i++) {
992 0 : const GuardedAlternative& alt = alts[i];
993 0 : if (alt.guards() != nullptr && alt.guards()->length() != 0) {
994 3 : bm->SetRest(offset); // Give up trying to fill in info.
995 3 : SaveBMInfo(bm, not_at_start, offset);
996 : return true;
997 : }
998 378 : if (!alt.node()->FillInBMInfo(offset, budget, bm, not_at_start))
999 : return false;
1000 : }
1001 66 : SaveBMInfo(bm, not_at_start, offset);
1002 : return true;
1003 : }
1004 :
1005 : RegExpNode*
1006 0 : ChoiceNode::FilterLATIN1(int depth, bool ignore_case, bool unicode)
1007 : {
1008 0 : if (info()->replacement_calculated)
1009 1 : return replacement();
1010 0 : if (depth < 0)
1011 0 : return this;
1012 0 : if (info()->visited)
1013 0 : return this;
1014 546 : VisitMarker marker(info());
1015 0 : int choice_count = alternatives().length();
1016 :
1017 0 : for (int i = 0; i < choice_count; i++) {
1018 0 : const GuardedAlternative alternative = alternatives()[i];
1019 0 : if (alternative.guards() != nullptr && alternative.guards()->length() != 0) {
1020 46 : set_replacement(this);
1021 23 : return this;
1022 : }
1023 : }
1024 :
1025 : int surviving = 0;
1026 : RegExpNode* survivor = nullptr;
1027 2246 : for (int i = 0; i < choice_count; i++) {
1028 0 : GuardedAlternative alternative = alternatives()[i];
1029 : RegExpNode* replacement =
1030 0 : alternative.node()->FilterLATIN1(depth - 1, ignore_case, unicode);
1031 0 : MOZ_ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
1032 0 : if (replacement != nullptr) {
1033 0 : alternatives()[i].set_node(replacement);
1034 998 : surviving++;
1035 998 : survivor = replacement;
1036 : }
1037 : }
1038 250 : if (surviving < 2)
1039 0 : return set_replacement(survivor);
1040 :
1041 500 : set_replacement(this);
1042 250 : if (surviving == choice_count)
1043 : return this;
1044 :
1045 : // Only some of the nodes survived the filtering. We need to rebuild the
1046 : // alternatives list.
1047 0 : GuardedAlternativeVector new_alternatives(*alloc());
1048 0 : new_alternatives.reserve(surviving);
1049 0 : for (int i = 0; i < choice_count; i++) {
1050 : RegExpNode* replacement =
1051 0 : alternatives()[i].node()->FilterLATIN1(depth - 1, ignore_case, unicode);
1052 0 : if (replacement != nullptr) {
1053 0 : alternatives()[i].set_node(replacement);
1054 0 : new_alternatives.append(alternatives()[i]);
1055 : }
1056 : }
1057 :
1058 0 : alternatives_ = std::move(new_alternatives);
1059 0 : return this;
1060 : }
1061 :
1062 : // -------------------------------------------------------------------
1063 : // NegativeLookaheadChoiceNode
1064 :
1065 : bool
1066 0 : NegativeLookaheadChoiceNode::FillInBMInfo(int offset,
1067 : int budget,
1068 : BoyerMooreLookahead* bm,
1069 : bool not_at_start)
1070 : {
1071 0 : if (!bm->CheckOverRecursed())
1072 : return false;
1073 :
1074 0 : if (!alternatives()[1].node()->FillInBMInfo(offset, budget - 1, bm, not_at_start))
1075 : return false;
1076 0 : if (offset == 0)
1077 0 : set_bm_info(not_at_start, bm);
1078 : return true;
1079 : }
1080 :
1081 : int
1082 0 : NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
1083 : {
1084 3 : if (budget <= 0)
1085 : return 0;
1086 :
1087 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
1088 : // afterwards.
1089 6 : RegExpNode* node = alternatives()[1].node();
1090 3 : return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1091 : }
1092 :
1093 : void
1094 0 : NegativeLookaheadChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
1095 : RegExpCompiler* compiler,
1096 : int filled_in,
1097 : bool not_at_start)
1098 : {
1099 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
1100 : // afterwards.
1101 0 : RegExpNode* node = alternatives()[1].node();
1102 0 : return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1103 : }
1104 :
1105 : RegExpNode*
1106 0 : NegativeLookaheadChoiceNode::FilterLATIN1(int depth, bool ignore_case, bool unicode)
1107 : {
1108 0 : if (info()->replacement_calculated)
1109 0 : return replacement();
1110 0 : if (depth < 0)
1111 0 : return this;
1112 1 : if (info()->visited)
1113 0 : return this;
1114 :
1115 2 : VisitMarker marker(info());
1116 :
1117 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
1118 : // afterwards.
1119 2 : RegExpNode* node = alternatives()[1].node();
1120 0 : RegExpNode* replacement = node->FilterLATIN1(depth - 1, ignore_case, unicode);
1121 :
1122 0 : if (replacement == nullptr)
1123 0 : return set_replacement(nullptr);
1124 0 : alternatives()[1].set_node(replacement);
1125 :
1126 2 : RegExpNode* neg_node = alternatives()[0].node();
1127 1 : RegExpNode* neg_replacement = neg_node->FilterLATIN1(depth - 1, ignore_case, unicode);
1128 :
1129 : // If the negative lookahead is always going to fail then
1130 : // we don't need to check it.
1131 1 : if (neg_replacement == nullptr)
1132 0 : return set_replacement(replacement);
1133 :
1134 3 : alternatives()[0].set_node(neg_replacement);
1135 2 : return set_replacement(this);
1136 : }
1137 :
1138 : // -------------------------------------------------------------------
1139 : // LoopChoiceNode
1140 :
1141 : void
1142 0 : GuardedAlternative::AddGuard(LifoAlloc* alloc, Guard* guard)
1143 : {
1144 0 : if (guards_ == nullptr)
1145 0 : guards_ = alloc->newInfallible<GuardVector>(*alloc);
1146 47 : guards_->append(guard);
1147 47 : }
1148 :
1149 : void
1150 0 : LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt)
1151 : {
1152 0 : MOZ_ASSERT(loop_node_ == nullptr);
1153 0 : AddAlternative(alt);
1154 165 : loop_node_ = alt.node();
1155 165 : }
1156 :
1157 :
1158 : void
1159 0 : LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt)
1160 : {
1161 0 : MOZ_ASSERT(continue_node_ == nullptr);
1162 0 : AddAlternative(alt);
1163 165 : continue_node_ = alt.node();
1164 165 : }
1165 :
1166 : int
1167 0 : LoopChoiceNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start)
1168 : {
1169 670 : return EatsAtLeastHelper(still_to_find,
1170 : budget - 1,
1171 : loop_node_,
1172 670 : not_at_start);
1173 : }
1174 :
1175 : void
1176 286 : LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
1177 : RegExpCompiler* compiler,
1178 : int characters_filled_in,
1179 : bool not_at_start)
1180 : {
1181 0 : if (body_can_be_zero_length_ || info()->visited)
1182 : return;
1183 384 : VisitMarker marker(info());
1184 192 : return ChoiceNode::GetQuickCheckDetails(details,
1185 : compiler,
1186 : characters_filled_in,
1187 192 : not_at_start);
1188 : }
1189 :
1190 : bool
1191 33 : LoopChoiceNode::FillInBMInfo(int offset,
1192 : int budget,
1193 : BoyerMooreLookahead* bm,
1194 : bool not_at_start)
1195 : {
1196 0 : if (body_can_be_zero_length_ || budget <= 0) {
1197 1 : bm->SetRest(offset);
1198 1 : SaveBMInfo(bm, not_at_start, offset);
1199 : return true;
1200 : }
1201 0 : if (!ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start))
1202 : return false;
1203 32 : SaveBMInfo(bm, not_at_start, offset);
1204 : return true;
1205 : }
1206 :
1207 : RegExpNode*
1208 0 : LoopChoiceNode::FilterLATIN1(int depth, bool ignore_case, bool unicode)
1209 : {
1210 0 : if (info()->replacement_calculated)
1211 1 : return replacement();
1212 0 : if (depth < 0)
1213 0 : return this;
1214 253 : if (info()->visited)
1215 115 : return this;
1216 :
1217 : {
1218 414 : VisitMarker marker(info());
1219 :
1220 : RegExpNode* continue_replacement =
1221 138 : continue_node_->FilterLATIN1(depth - 1, ignore_case, unicode);
1222 :
1223 : // If we can't continue after the loop then there is no sense in doing the
1224 : // loop.
1225 138 : if (continue_replacement == nullptr)
1226 0 : return set_replacement(nullptr);
1227 : }
1228 :
1229 138 : return ChoiceNode::FilterLATIN1(depth - 1, ignore_case, unicode);
1230 : }
1231 :
1232 : // -------------------------------------------------------------------
1233 : // Analysis
1234 :
1235 : void
1236 0 : Analysis::EnsureAnalyzed(RegExpNode* that)
1237 : {
1238 3607 : if (!CheckRecursionLimit(cx)) {
1239 0 : failASCII("Stack overflow");
1240 : return;
1241 : }
1242 :
1243 0 : if (that->info()->been_analyzed || that->info()->being_analyzed)
1244 : return;
1245 0 : that->info()->being_analyzed = true;
1246 0 : that->Accept(this);
1247 2793 : that->info()->being_analyzed = false;
1248 2793 : that->info()->been_analyzed = true;
1249 : }
1250 :
1251 : void
1252 150 : Analysis::VisitEnd(EndNode* that)
1253 : {
1254 : // nothing to do
1255 150 : }
1256 :
1257 : void
1258 0 : Analysis::VisitText(TextNode* that)
1259 : {
1260 0 : if (ignore_case_)
1261 0 : that->MakeCaseIndependent(is_latin1_, unicode_);
1262 0 : EnsureAnalyzed(that->on_success());
1263 1078 : if (!has_failed()) {
1264 0 : that->CalculateOffsets();
1265 : }
1266 1078 : }
1267 :
1268 : void
1269 0 : Analysis::VisitAction(ActionNode* that)
1270 : {
1271 1026 : RegExpNode* target = that->on_success();
1272 0 : EnsureAnalyzed(target);
1273 :
1274 1026 : if (!has_failed()) {
1275 : // If the next node is interested in what it follows then this node
1276 : // has to be interested too so it can pass the information on.
1277 0 : that->info()->AddFromFollowing(target->info());
1278 : }
1279 1026 : }
1280 :
1281 : void
1282 0 : Analysis::VisitChoice(ChoiceNode* that)
1283 : {
1284 0 : NodeInfo* info = that->info();
1285 :
1286 0 : for (size_t i = 0; i < that->alternatives().length(); i++) {
1287 0 : RegExpNode* node = that->alternatives()[i].node();
1288 796 : EnsureAnalyzed(node);
1289 796 : if (has_failed()) return;
1290 :
1291 : // Anything the following nodes need to know has to be known by
1292 : // this node also, so it can pass it on.
1293 1592 : info->AddFromFollowing(node->info());
1294 : }
1295 : }
1296 :
1297 : void
1298 0 : Analysis::VisitLoopChoice(LoopChoiceNode* that)
1299 : {
1300 0 : NodeInfo* info = that->info();
1301 0 : for (size_t i = 0; i < that->alternatives().length(); i++) {
1302 0 : RegExpNode* node = that->alternatives()[i].node();
1303 0 : if (node != that->loop_node()) {
1304 0 : EnsureAnalyzed(node);
1305 165 : if (has_failed()) return;
1306 165 : info->AddFromFollowing(node->info());
1307 : }
1308 : }
1309 :
1310 : // Check the loop last since it may need the value of this node
1311 : // to get a correct result.
1312 0 : EnsureAnalyzed(that->loop_node());
1313 165 : if (!has_failed())
1314 165 : info->AddFromFollowing(that->loop_node()->info());
1315 : }
1316 :
1317 : void
1318 0 : Analysis::VisitBackReference(BackReferenceNode* that)
1319 : {
1320 0 : EnsureAnalyzed(that->on_success());
1321 0 : }
1322 :
1323 : void
1324 0 : Analysis::VisitAssertion(AssertionNode* that)
1325 : {
1326 228 : EnsureAnalyzed(that->on_success());
1327 228 : }
1328 :
1329 : // -------------------------------------------------------------------
1330 : // Implementation of the Irregexp regular expression engine.
1331 : //
1332 : // The Irregexp regular expression engine is intended to be a complete
1333 : // implementation of ECMAScript regular expressions. It generates either
1334 : // bytecodes or native code.
1335 :
1336 : // The Irregexp regexp engine is structured in three steps.
1337 : // 1) The parser generates an abstract syntax tree. See RegExpAST.cpp.
1338 : // 2) From the AST a node network is created. The nodes are all
1339 : // subclasses of RegExpNode. The nodes represent states when
1340 : // executing a regular expression. Several optimizations are
1341 : // performed on the node network.
1342 : // 3) From the nodes we generate either byte codes or native code
1343 : // that can actually execute the regular expression (perform
1344 : // the search). The code generation step is described in more
1345 : // detail below.
1346 :
1347 : // Code generation.
1348 : //
1349 : // The nodes are divided into four main categories.
1350 : // * Choice nodes
1351 : // These represent places where the regular expression can
1352 : // match in more than one way. For example on entry to an
1353 : // alternation (foo|bar) or a repetition (*, +, ? or {}).
1354 : // * Action nodes
1355 : // These represent places where some action should be
1356 : // performed. Examples include recording the current position
1357 : // in the input string to a register (in order to implement
1358 : // captures) or other actions on register for example in order
1359 : // to implement the counters needed for {} repetitions.
1360 : // * Matching nodes
1361 : // These attempt to match some element part of the input string.
1362 : // Examples of elements include character classes, plain strings
1363 : // or back references.
1364 : // * End nodes
1365 : // These are used to implement the actions required on finding
1366 : // a successful match or failing to find a match.
1367 : //
1368 : // The code generated (whether as byte codes or native code) maintains
1369 : // some state as it runs. This consists of the following elements:
1370 : //
1371 : // * The capture registers. Used for string captures.
1372 : // * Other registers. Used for counters etc.
1373 : // * The current position.
1374 : // * The stack of backtracking information. Used when a matching node
1375 : // fails to find a match and needs to try an alternative.
1376 : //
1377 : // Conceptual regular expression execution model:
1378 : //
1379 : // There is a simple conceptual model of regular expression execution
1380 : // which will be presented first. The actual code generated is a more
1381 : // efficient simulation of the simple conceptual model:
1382 : //
1383 : // * Choice nodes are implemented as follows:
1384 : // For each choice except the last {
1385 : // push current position
1386 : // push backtrack code location
1387 : // <generate code to test for choice>
1388 : // backtrack code location:
1389 : // pop current position
1390 : // }
1391 : // <generate code to test for last choice>
1392 : //
1393 : // * Actions nodes are generated as follows
1394 : // <push affected registers on backtrack stack>
1395 : // <generate code to perform action>
1396 : // push backtrack code location
1397 : // <generate code to test for following nodes>
1398 : // backtrack code location:
1399 : // <pop affected registers to restore their state>
1400 : // <pop backtrack location from stack and go to it>
1401 : //
1402 : // * Matching nodes are generated as follows:
1403 : // if input string matches at current position
1404 : // update current position
1405 : // <generate code to test for following nodes>
1406 : // else
1407 : // <pop backtrack location from stack and go to it>
1408 : //
1409 : // Thus it can be seen that the current position is saved and restored
1410 : // by the choice nodes, whereas the registers are saved and restored by
1411 : // by the action nodes that manipulate them.
1412 : //
1413 : // The other interesting aspect of this model is that nodes are generated
1414 : // at the point where they are needed by a recursive call to Emit(). If
1415 : // the node has already been code generated then the Emit() call will
1416 : // generate a jump to the previously generated code instead. In order to
1417 : // limit recursion it is possible for the Emit() function to put the node
1418 : // on a work list for later generation and instead generate a jump. The
1419 : // destination of the jump is resolved later when the code is generated.
1420 : //
1421 : // Actual regular expression code generation.
1422 : //
1423 : // Code generation is actually more complicated than the above. In order
1424 : // to improve the efficiency of the generated code some optimizations are
1425 : // performed
1426 : //
1427 : // * Choice nodes have 1-character lookahead.
1428 : // A choice node looks at the following character and eliminates some of
1429 : // the choices immediately based on that character. This is not yet
1430 : // implemented.
1431 : // * Simple greedy loops store reduced backtracking information.
1432 : // A quantifier like /.*foo/m will greedily match the whole input. It will
1433 : // then need to backtrack to a point where it can match "foo". The naive
1434 : // implementation of this would push each character position onto the
1435 : // backtracking stack, then pop them off one by one. This would use space
1436 : // proportional to the length of the input string. However since the "."
1437 : // can only match in one way and always has a constant length (in this case
1438 : // of 1) it suffices to store the current position on the top of the stack
1439 : // once. Matching now becomes merely incrementing the current position and
1440 : // backtracking becomes decrementing the current position and checking the
1441 : // result against the stored current position. This is faster and saves
1442 : // space.
1443 : // * The current state is virtualized.
1444 : // This is used to defer expensive operations until it is clear that they
1445 : // are needed and to generate code for a node more than once, allowing
1446 : // specialized an efficient versions of the code to be created. This is
1447 : // explained in the section below.
1448 : //
1449 : // Execution state virtualization.
1450 : //
1451 : // Instead of emitting code, nodes that manipulate the state can record their
1452 : // manipulation in an object called the Trace. The Trace object can record a
1453 : // current position offset, an optional backtrack code location on the top of
1454 : // the virtualized backtrack stack and some register changes. When a node is
1455 : // to be emitted it can flush the Trace or update it. Flushing the Trace
1456 : // will emit code to bring the actual state into line with the virtual state.
1457 : // Avoiding flushing the state can postpone some work (e.g. updates of capture
1458 : // registers). Postponing work can save time when executing the regular
1459 : // expression since it may be found that the work never has to be done as a
1460 : // failure to match can occur. In addition it is much faster to jump to a
1461 : // known backtrack code location than it is to pop an unknown backtrack
1462 : // location from the stack and jump there.
1463 : //
1464 : // The virtual state found in the Trace affects code generation. For example
1465 : // the virtual state contains the difference between the actual current
1466 : // position and the virtual current position, and matching code needs to use
1467 : // this offset to attempt a match in the correct location of the input
1468 : // string. Therefore code generated for a non-trivial trace is specialized
1469 : // to that trace. The code generator therefore has the ability to generate
1470 : // code for each node several times. In order to limit the size of the
1471 : // generated code there is an arbitrary limit on how many specialized sets of
1472 : // code may be generated for a given node. If the limit is reached, the
1473 : // trace is flushed and a generic version of the code for a node is emitted.
1474 : // This is subsequently used for that node. The code emitted for non-generic
1475 : // trace is not recorded in the node and so it cannot currently be reused in
1476 : // the event that code generation is requested for an identical trace.
1477 :
1478 : /* static */ TextElement
1479 0 : TextElement::Atom(RegExpAtom* atom)
1480 : {
1481 922 : return TextElement(ATOM, atom);
1482 : }
1483 :
1484 : /* static */ TextElement
1485 0 : TextElement::CharClass(RegExpCharacterClass* char_class)
1486 : {
1487 273 : return TextElement(CHAR_CLASS, char_class);
1488 : }
1489 :
1490 : int
1491 0 : TextElement::length() const
1492 : {
1493 0 : switch (text_type()) {
1494 : case ATOM:
1495 10034 : return atom()->length();
1496 : case CHAR_CLASS:
1497 : return 1;
1498 : }
1499 0 : MOZ_CRASH("Bad text type");
1500 : }
1501 :
1502 : class FrequencyCollator
1503 : {
1504 : public:
1505 0 : FrequencyCollator() : total_samples_(0) {
1506 19221 : for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
1507 19072 : frequencies_[i] = CharacterFrequency(i);
1508 : }
1509 : }
1510 :
1511 : void CountCharacter(int character) {
1512 0 : int index = (character & RegExpMacroAssembler::kTableMask);
1513 8700 : frequencies_[index].Increment();
1514 4350 : total_samples_++;
1515 : }
1516 :
1517 : // Does not measure in percent, but rather per-128 (the table size from the
1518 : // regexp macro assembler).
1519 0 : int Frequency(int in_character) {
1520 1068 : MOZ_ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
1521 0 : if (total_samples_ < 1) return 1; // Division by zero.
1522 : int freq_in_per128 =
1523 1065 : (frequencies_[in_character].counter() * 128) / total_samples_;
1524 1065 : return freq_in_per128;
1525 : }
1526 :
1527 : private:
1528 : class CharacterFrequency {
1529 : public:
1530 19072 : CharacterFrequency() : counter_(0), character_(-1) { }
1531 : explicit CharacterFrequency(int character)
1532 : : counter_(0), character_(character)
1533 : {}
1534 :
1535 4350 : void Increment() { counter_++; }
1536 : int counter() { return counter_; }
1537 : int character() { return character_; }
1538 :
1539 : private:
1540 : int counter_;
1541 : int character_;
1542 : };
1543 :
1544 : private:
1545 : CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
1546 : int total_samples_;
1547 : };
1548 :
1549 149 : class irregexp::RegExpCompiler
1550 : {
1551 : public:
1552 : RegExpCompiler(JSContext* cx, LifoAlloc* alloc, int capture_count,
1553 : bool ignore_case, bool is_latin1, bool match_only, bool unicode);
1554 :
1555 : int AllocateRegister() {
1556 53 : if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1557 0 : reg_exp_too_big_ = true;
1558 : return next_register_;
1559 : }
1560 53 : return next_register_++;
1561 : }
1562 :
1563 : RegExpCode Assemble(JSContext* cx,
1564 : RegExpMacroAssembler* assembler,
1565 : RegExpNode* start,
1566 : int capture_count);
1567 :
1568 0 : inline void AddWork(RegExpNode* node) {
1569 0 : AutoEnterOOMUnsafeRegion oomUnsafe;
1570 0 : if (!work_list_.append(node))
1571 0 : oomUnsafe.crash("AddWork");
1572 0 : }
1573 :
1574 : static const int kImplementationOffset = 0;
1575 : static const int kNumberOfRegistersOffset = 0;
1576 : static const int kCodeOffset = 1;
1577 :
1578 : RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1579 : EndNode* accept() { return accept_; }
1580 :
1581 : static const int kMaxRecursion = 100;
1582 : inline int recursion_depth() { return recursion_depth_; }
1583 3739 : inline void IncrementRecursionDepth() { recursion_depth_++; }
1584 0 : inline void DecrementRecursionDepth() { recursion_depth_--; }
1585 :
1586 0 : void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1587 : inline bool isRegExpTooBig() { return reg_exp_too_big_; }
1588 :
1589 : inline bool ignore_case() { return ignore_case_; }
1590 : inline bool latin1() { return latin1_; }
1591 : inline bool unicode() { return unicode_; }
1592 1068 : FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1593 :
1594 : int current_expansion_factor() { return current_expansion_factor_; }
1595 : void set_current_expansion_factor(int value) {
1596 442 : current_expansion_factor_ = value;
1597 : }
1598 :
1599 : JSContext* cx() const { return cx_; }
1600 : LifoAlloc* alloc() const { return alloc_; }
1601 :
1602 : static const int kNoRegister = -1;
1603 :
1604 : bool CheckOverRecursed();
1605 :
1606 : private:
1607 : EndNode* accept_;
1608 : int next_register_;
1609 : Vector<RegExpNode*, 4, SystemAllocPolicy> work_list_;
1610 : int recursion_depth_;
1611 : RegExpMacroAssembler* macro_assembler_;
1612 : bool ignore_case_;
1613 : bool latin1_;
1614 : bool match_only_;
1615 : bool unicode_;
1616 : bool reg_exp_too_big_;
1617 : int current_expansion_factor_;
1618 : FrequencyCollator frequency_collator_;
1619 : JSContext* cx_;
1620 : LifoAlloc* alloc_;
1621 : };
1622 :
1623 : class RecursionCheck
1624 : {
1625 : public:
1626 3739 : explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1627 0 : compiler->IncrementRecursionDepth();
1628 : }
1629 7478 : ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1630 :
1631 : private:
1632 : RegExpCompiler* compiler_;
1633 : };
1634 :
1635 : static inline bool
1636 : IsLatin1Equivalent(char16_t c, RegExpCompiler* compiler)
1637 : {
1638 20550 : if (c <= kMaxOneByteCharCode)
1639 : return true;
1640 :
1641 0 : if (!compiler->ignore_case())
1642 : return false;
1643 :
1644 0 : char16_t converted = ConvertNonLatin1ToLatin1(c, compiler->unicode());
1645 :
1646 0 : return converted != 0 && converted <= kMaxOneByteCharCode;
1647 : }
1648 :
1649 : // Attempts to compile the regexp using an Irregexp code generator. Returns
1650 : // a fixed array or a null handle depending on whether it succeeded.
1651 0 : RegExpCompiler::RegExpCompiler(JSContext* cx, LifoAlloc* alloc, int capture_count,
1652 149 : bool ignore_case, bool latin1, bool match_only, bool unicode)
1653 149 : : next_register_(2 * (capture_count + 1)),
1654 : recursion_depth_(0),
1655 : ignore_case_(ignore_case),
1656 : latin1_(latin1),
1657 : match_only_(match_only),
1658 : unicode_(unicode),
1659 : reg_exp_too_big_(false),
1660 : current_expansion_factor_(1),
1661 : frequency_collator_(),
1662 : cx_(cx),
1663 0 : alloc_(alloc)
1664 : {
1665 0 : accept_ = alloc->newInfallible<EndNode>(alloc, EndNode::ACCEPT);
1666 149 : MOZ_ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1667 149 : }
1668 :
1669 : RegExpCode
1670 149 : RegExpCompiler::Assemble(JSContext* cx,
1671 : RegExpMacroAssembler* assembler,
1672 : RegExpNode* start,
1673 : int capture_count)
1674 : {
1675 149 : macro_assembler_ = assembler;
1676 298 : macro_assembler_->set_slow_safe(false);
1677 :
1678 : // The LifoAlloc used by the regexp compiler is infallible and is currently
1679 : // expected to crash on OOM. Thus we have to disable the assertions made to
1680 : // prevent us from allocating any new chunk in the LifoAlloc. This is needed
1681 : // because the jit::MacroAssembler turns these assertions on by default.
1682 0 : LifoAlloc::AutoFallibleScope fallibleAllocator(alloc());
1683 :
1684 0 : jit::Label fail;
1685 0 : macro_assembler_->PushBacktrack(&fail);
1686 0 : Trace new_trace;
1687 0 : start->Emit(this, &new_trace);
1688 149 : macro_assembler_->BindBacktrack(&fail);
1689 0 : macro_assembler_->Fail();
1690 :
1691 149 : while (!work_list_.empty())
1692 0 : work_list_.popCopy()->Emit(this, &new_trace);
1693 :
1694 0 : RegExpCode code = macro_assembler_->GenerateCode(cx, match_only_);
1695 149 : if (code.empty())
1696 0 : return RegExpCode();
1697 :
1698 0 : if (reg_exp_too_big_) {
1699 0 : code.destroy();
1700 0 : js::gc::AutoSuppressGC suppress(cx);
1701 0 : JS_ReportErrorASCII(cx, "regexp too big");
1702 0 : return RegExpCode();
1703 : }
1704 :
1705 149 : return code;
1706 : }
1707 :
1708 : template <typename CharT>
1709 : static void
1710 7 : SampleChars(FrequencyCollator* collator, const CharT* chars, size_t length)
1711 : {
1712 : // Sample some characters from the middle of the string.
1713 : static const int kSampleSize = 128;
1714 :
1715 0 : int chars_sampled = 0;
1716 0 : int half_way = (int(length) - kSampleSize) / 2;
1717 4499 : for (size_t i = Max(0, half_way);
1718 4499 : i < length && chars_sampled < kSampleSize;
1719 : i++, chars_sampled++)
1720 : {
1721 0 : collator->CountCharacter(chars[i]);
1722 : }
1723 7 : }
1724 :
1725 : static bool
1726 : IsNativeRegExpEnabled(JSContext* cx)
1727 : {
1728 : #ifdef JS_CODEGEN_NONE
1729 : return false;
1730 : #else
1731 298 : return cx->options().nativeRegExp();
1732 : #endif
1733 : }
1734 :
1735 : RegExpCode
1736 149 : irregexp::CompilePattern(JSContext* cx, HandleRegExpShared shared, RegExpCompileData* data,
1737 : HandleLinearString sample, bool is_global, bool ignore_case,
1738 : bool is_latin1, bool match_only, bool force_bytecode, bool sticky,
1739 : bool unicode, RegExpShared::JitCodeTables& tables)
1740 : {
1741 1 : if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
1742 0 : JS_ReportErrorASCII(cx, "regexp too big");
1743 0 : return RegExpCode();
1744 : }
1745 :
1746 0 : LifoAlloc& alloc = cx->tempLifoAlloc();
1747 : RegExpCompiler compiler(cx, &alloc, data->capture_count, ignore_case, is_latin1, match_only,
1748 149 : unicode);
1749 :
1750 : // Sample some characters from the middle of the string.
1751 0 : if (sample->hasLatin1Chars()) {
1752 284 : JS::AutoCheckCannotGC nogc;
1753 0 : SampleChars(compiler.frequency_collator(), sample->latin1Chars(nogc), sample->length());
1754 : } else {
1755 14 : JS::AutoCheckCannotGC nogc;
1756 28 : SampleChars(compiler.frequency_collator(), sample->twoByteChars(nogc), sample->length());
1757 : }
1758 :
1759 : // Wrap the body of the regexp in capture #0.
1760 149 : RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
1761 : 0,
1762 : &compiler,
1763 0 : compiler.accept());
1764 0 : RegExpNode* node = captured_body;
1765 0 : bool is_end_anchored = data->tree->IsAnchoredAtEnd();
1766 0 : bool is_start_anchored = sticky || data->tree->IsAnchoredAtStart();
1767 149 : int max_length = data->tree->max_match();
1768 149 : if (!is_start_anchored) {
1769 : // Add a .*? at the beginning, outside the body capture, unless
1770 : // this expression is anchored at the beginning.
1771 : RegExpNode* loop_node =
1772 116 : RegExpQuantifier::ToNode(0,
1773 : RegExpTree::kInfinity,
1774 : false,
1775 116 : alloc.newInfallible<RegExpCharacterClass>('*'),
1776 : &compiler,
1777 : captured_body,
1778 0 : data->contains_anchor);
1779 :
1780 58 : if (data->contains_anchor) {
1781 : // Unroll loop once, to take care of the case that might start
1782 : // at the start of input.
1783 0 : ChoiceNode* first_step_node = alloc.newInfallible<ChoiceNode>(&alloc, 2);
1784 : RegExpNode* char_class =
1785 0 : alloc.newInfallible<TextNode>(alloc.newInfallible<RegExpCharacterClass>('*'), loop_node);
1786 0 : first_step_node->AddAlternative(GuardedAlternative(captured_body));
1787 38 : first_step_node->AddAlternative(GuardedAlternative(char_class));
1788 19 : node = first_step_node;
1789 : } else {
1790 : node = loop_node;
1791 : }
1792 : }
1793 :
1794 0 : if (compiler.isRegExpTooBig()) {
1795 : // This might erase the over-recurse error, if any.
1796 0 : JS_ReportErrorASCII(cx, "regexp too big");
1797 0 : return RegExpCode();
1798 : }
1799 :
1800 149 : if (is_latin1) {
1801 142 : node = node->FilterLATIN1(RegExpCompiler::kMaxRecursion, ignore_case, unicode);
1802 : // Do it again to propagate the new nodes to places where they were not
1803 : // put because they had not been calculated yet.
1804 142 : if (node != nullptr) {
1805 142 : node = node->FilterLATIN1(RegExpCompiler::kMaxRecursion, ignore_case, unicode);
1806 : }
1807 : }
1808 :
1809 149 : if (node == nullptr)
1810 0 : node = alloc.newInfallible<EndNode>(&alloc, EndNode::BACKTRACK);
1811 :
1812 0 : Analysis analysis(cx, ignore_case, is_latin1, unicode);
1813 0 : analysis.EnsureAnalyzed(node);
1814 0 : if (analysis.has_failed()) {
1815 0 : JS_ReportErrorASCII(cx, "%s", analysis.errorMessage());
1816 0 : return RegExpCode();
1817 : }
1818 :
1819 : // We should not GC when we have a jit::MacroAssembler on the stack. Check
1820 : // this here because the static analysis does not understand the
1821 : // Maybe<NativeRegExpMacroAssembler> below.
1822 0 : JS::AutoCheckCannotGC nogc(cx);
1823 :
1824 0 : Maybe<jit::JitContext> ctx;
1825 298 : Maybe<NativeRegExpMacroAssembler> native_assembler;
1826 298 : Maybe<InterpretedRegExpMacroAssembler> interpreted_assembler;
1827 :
1828 : RegExpMacroAssembler* assembler;
1829 0 : if (IsNativeRegExpEnabled(cx) &&
1830 0 : !force_bytecode &&
1831 447 : jit::CanLikelyAllocateMoreExecutableMemory() &&
1832 298 : shared->getSource()->length() < 32 * 1024)
1833 : {
1834 : NativeRegExpMacroAssembler::Mode mode =
1835 149 : is_latin1 ? NativeRegExpMacroAssembler::LATIN1
1836 0 : : NativeRegExpMacroAssembler::CHAR16;
1837 :
1838 0 : ctx.emplace(cx, (jit::TempAllocator*) nullptr);
1839 149 : native_assembler.emplace(cx, &alloc, mode, (data->capture_count + 1) * 2, tables);
1840 0 : assembler = native_assembler.ptr();
1841 : } else {
1842 0 : interpreted_assembler.emplace(cx, &alloc, (data->capture_count + 1) * 2);
1843 0 : assembler = interpreted_assembler.ptr();
1844 : }
1845 :
1846 : // Inserted here, instead of in Assembler, because it depends on information
1847 : // in the AST that isn't replicated in the Node structure.
1848 : static const int kMaxBacksearchLimit = 1024;
1849 149 : if (is_end_anchored &&
1850 0 : !is_start_anchored &&
1851 : max_length < kMaxBacksearchLimit) {
1852 4 : assembler->SetCurrentPositionFromEnd(max_length);
1853 : }
1854 :
1855 149 : if (is_global) {
1856 0 : assembler->set_global_mode((data->tree->min_match() > 0)
1857 : ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
1858 0 : : RegExpMacroAssembler::GLOBAL);
1859 : }
1860 :
1861 149 : return compiler.Assemble(cx, assembler, node, data->capture_count);
1862 : }
1863 :
1864 : template <typename CharT>
1865 : RegExpRunStatus
1866 2017 : irregexp::ExecuteCode(JSContext* cx, jit::JitCode* codeBlock, const CharT* chars, size_t start,
1867 : size_t length, MatchPairs* matches, size_t* endIndex)
1868 : {
1869 : typedef void (*RegExpCodeSignature)(InputOutputData*);
1870 :
1871 0 : InputOutputData data(chars, chars + length, start, matches, endIndex);
1872 :
1873 2017 : RegExpCodeSignature function = reinterpret_cast<RegExpCodeSignature>(codeBlock->raw());
1874 :
1875 : {
1876 4034 : JS::AutoSuppressGCAnalysis nogc;
1877 2017 : CALL_GENERATED_1(function, &data);
1878 : }
1879 :
1880 2017 : return (RegExpRunStatus) data.result;
1881 : }
1882 :
1883 : template RegExpRunStatus
1884 : irregexp::ExecuteCode(JSContext* cx, jit::JitCode* codeBlock, const Latin1Char* chars, size_t start,
1885 : size_t length, MatchPairs* matches, size_t* endIndex);
1886 :
1887 : template RegExpRunStatus
1888 : irregexp::ExecuteCode(JSContext* cx, jit::JitCode* codeBlock, const char16_t* chars, size_t start,
1889 : size_t length, MatchPairs* matches, size_t* endIndex);
1890 :
1891 : // -------------------------------------------------------------------
1892 : // Tree to graph conversion
1893 :
1894 : RegExpNode*
1895 844 : RegExpAtom::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
1896 : {
1897 : TextElementVector* elms =
1898 0 : compiler->alloc()->newInfallible<TextElementVector>(*compiler->alloc());
1899 844 : elms->append(TextElement::Atom(this));
1900 844 : return compiler->alloc()->newInfallible<TextNode>(elms, on_success);
1901 : }
1902 :
1903 : RegExpNode*
1904 0 : RegExpText::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
1905 : {
1906 16 : return compiler->alloc()->newInfallible<TextNode>(&elements_, on_success);
1907 : }
1908 :
1909 : RegExpNode*
1910 0 : RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
1911 : {
1912 199 : return compiler->alloc()->newInfallible<TextNode>(this, on_success);
1913 : }
1914 :
1915 : RegExpNode*
1916 0 : RegExpDisjunction::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
1917 : {
1918 112 : if (!compiler->CheckOverRecursed())
1919 : return on_success;
1920 :
1921 0 : const RegExpTreeVector& alternatives = this->alternatives();
1922 0 : size_t length = alternatives.length();
1923 0 : ChoiceNode* result = compiler->alloc()->newInfallible<ChoiceNode>(compiler->alloc(), length);
1924 0 : for (size_t i = 0; i < length && !compiler->isRegExpTooBig(); i++) {
1925 1456 : GuardedAlternative alternative(alternatives[i]->ToNode(compiler, on_success));
1926 728 : result->AddAlternative(alternative);
1927 : }
1928 : return result;
1929 : }
1930 :
1931 : RegExpNode*
1932 0 : RegExpQuantifier::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
1933 : {
1934 0 : return ToNode(min(),
1935 : max(),
1936 121 : is_greedy(),
1937 : body(),
1938 : compiler,
1939 121 : on_success);
1940 : }
1941 :
1942 : // Scoped object to keep track of how much we unroll quantifier loops in the
1943 : // regexp graph generator.
1944 : class RegExpExpansionLimiter
1945 : {
1946 : public:
1947 : static const int kMaxExpansionFactor = 6;
1948 0 : RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
1949 0 : : compiler_(compiler),
1950 221 : saved_expansion_factor_(compiler->current_expansion_factor()),
1951 0 : ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor)
1952 : {
1953 0 : MOZ_ASSERT(factor > 0);
1954 221 : if (ok_to_expand_) {
1955 0 : if (factor > kMaxExpansionFactor) {
1956 : // Avoid integer overflow of the current expansion factor.
1957 7 : ok_to_expand_ = false;
1958 0 : compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
1959 : } else {
1960 0 : int new_factor = saved_expansion_factor_ * factor;
1961 214 : ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
1962 214 : compiler->set_current_expansion_factor(new_factor);
1963 : }
1964 : }
1965 221 : }
1966 :
1967 : ~RegExpExpansionLimiter() {
1968 442 : compiler_->set_current_expansion_factor(saved_expansion_factor_);
1969 : }
1970 :
1971 : bool ok_to_expand() { return ok_to_expand_; }
1972 :
1973 : private:
1974 : RegExpCompiler* compiler_;
1975 : int saved_expansion_factor_;
1976 : bool ok_to_expand_;
1977 : };
1978 :
1979 : /* static */ RegExpNode*
1980 222 : RegExpQuantifier::ToNode(int min,
1981 : int max,
1982 : bool is_greedy,
1983 : RegExpTree* body,
1984 : RegExpCompiler* compiler,
1985 : RegExpNode* on_success,
1986 : bool not_at_start /* = false */)
1987 : {
1988 : // x{f, t} becomes this:
1989 : //
1990 : // (r++)<-.
1991 : // | `
1992 : // | (x)
1993 : // v ^
1994 : // (r=0)-->(?)---/ [if r < t]
1995 : // |
1996 : // [if r >= f] \----> ...
1997 : //
1998 :
1999 : // 15.10.2.5 RepeatMatcher algorithm.
2000 : // The parser has already eliminated the case where max is 0. In the case
2001 : // where max_match is zero the parser has removed the quantifier if min was
2002 : // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
2003 :
2004 : // If we know that we cannot match zero length then things are a little
2005 : // simpler since we don't need to make the special zero length match check
2006 : // from step 2.1. If the min and max are small we can unroll a little in
2007 : // this case.
2008 : static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
2009 : static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
2010 :
2011 222 : if (max == 0)
2012 : return on_success; // This can happen due to recursion.
2013 :
2014 222 : if (!compiler->CheckOverRecursed())
2015 : return on_success;
2016 :
2017 0 : bool body_can_be_empty = (body->min_match() == 0);
2018 0 : int body_start_reg = RegExpCompiler::kNoRegister;
2019 0 : Interval capture_registers = body->CaptureRegisters();
2020 222 : bool needs_capture_clearing = !capture_registers.is_empty();
2021 0 : LifoAlloc* alloc = compiler->alloc();
2022 :
2023 0 : if (body_can_be_empty) {
2024 0 : body_start_reg = compiler->AllocateRegister();
2025 222 : } else if (!needs_capture_clearing) {
2026 : // Only unroll if there are no captures and the body can't be
2027 : // empty.
2028 : {
2029 0 : RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
2030 207 : if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
2031 43 : int new_max = (max == kInfinity) ? max : max - min;
2032 : // Recurse once to get the loop or optional matches after the fixed
2033 : // ones.
2034 43 : RegExpNode* answer = ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
2035 : // Unroll the forced matches from 0 to min. This can cause chains of
2036 : // TextNodes (which the parser does not generate). These should be
2037 : // combined if it turns out they hinder good code generation.
2038 0 : for (int i = 0; i < min; i++)
2039 43 : answer = body->ToNode(compiler, answer);
2040 86 : return answer;
2041 : }
2042 : }
2043 0 : if (max <= kMaxUnrolledMaxMatches && min == 0) {
2044 0 : MOZ_ASSERT(max > 0); // Due to the 'if' above.
2045 14 : RegExpExpansionLimiter limiter(compiler, max);
2046 14 : if (limiter.ok_to_expand()) {
2047 : // Unroll the optional matches up to max.
2048 : RegExpNode* answer = on_success;
2049 0 : for (int i = 0; i < max; i++) {
2050 0 : ChoiceNode* alternation = alloc->newInfallible<ChoiceNode>(alloc, 2);
2051 0 : if (is_greedy) {
2052 28 : alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler, answer)));
2053 0 : alternation->AddAlternative(GuardedAlternative(on_success));
2054 : } else {
2055 0 : alternation->AddAlternative(GuardedAlternative(on_success));
2056 0 : alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler, answer)));
2057 : }
2058 14 : answer = alternation;
2059 0 : if (not_at_start) alternation->set_not_at_start();
2060 : }
2061 28 : return answer;
2062 : }
2063 : }
2064 : }
2065 0 : bool has_min = min > 0;
2066 165 : bool has_max = max < RegExpTree::kInfinity;
2067 0 : bool needs_counter = has_min || has_max;
2068 : int reg_ctr = needs_counter
2069 0 : ? compiler->AllocateRegister()
2070 0 : : RegExpCompiler::kNoRegister;
2071 0 : LoopChoiceNode* center = alloc->newInfallible<LoopChoiceNode>(alloc, body->min_match() == 0);
2072 165 : if (not_at_start)
2073 0 : center->set_not_at_start();
2074 : RegExpNode* loop_return = needs_counter
2075 0 : ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
2076 165 : : static_cast<RegExpNode*>(center);
2077 165 : if (body_can_be_empty) {
2078 : // If the body can be empty we need to check if it was and then
2079 : // backtrack.
2080 0 : loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
2081 : reg_ctr,
2082 : min,
2083 0 : loop_return);
2084 : }
2085 165 : RegExpNode* body_node = body->ToNode(compiler, loop_return);
2086 165 : if (body_can_be_empty) {
2087 : // If the body can be empty we need to store the start position
2088 : // so we can bail out if it was empty.
2089 0 : body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
2090 : }
2091 0 : if (needs_capture_clearing) {
2092 : // Before entering the body of this loop we need to clear captures.
2093 0 : body_node = ActionNode::ClearCaptures(capture_registers, body_node);
2094 : }
2095 0 : GuardedAlternative body_alt(body_node);
2096 0 : if (has_max) {
2097 60 : Guard* body_guard = alloc->newInfallible<Guard>(reg_ctr, Guard::LT, max);
2098 0 : body_alt.AddGuard(alloc, body_guard);
2099 : }
2100 0 : GuardedAlternative rest_alt(on_success);
2101 0 : if (has_min) {
2102 34 : Guard* rest_guard = alloc->newInfallible<Guard>(reg_ctr, Guard::GEQ, min);
2103 0 : rest_alt.AddGuard(alloc, rest_guard);
2104 : }
2105 0 : if (is_greedy) {
2106 101 : center->AddLoopAlternative(body_alt);
2107 0 : center->AddContinueAlternative(rest_alt);
2108 : } else {
2109 64 : center->AddContinueAlternative(rest_alt);
2110 0 : center->AddLoopAlternative(body_alt);
2111 : }
2112 165 : if (needs_counter)
2113 62 : return ActionNode::SetRegister(reg_ctr, 0, center);
2114 : return center;
2115 : }
2116 :
2117 : RegExpNode*
2118 228 : RegExpAssertion::ToNode(RegExpCompiler* compiler,
2119 : RegExpNode* on_success)
2120 : {
2121 228 : NodeInfo info;
2122 0 : LifoAlloc* alloc = compiler->alloc();
2123 :
2124 0 : switch (assertion_type()) {
2125 : case START_OF_LINE:
2126 0 : return AssertionNode::AfterNewline(on_success);
2127 : case START_OF_INPUT:
2128 0 : return AssertionNode::AtStart(on_success);
2129 : case BOUNDARY:
2130 0 : return AssertionNode::AtBoundary(on_success);
2131 : case NON_BOUNDARY:
2132 0 : return AssertionNode::AtNonBoundary(on_success);
2133 : case END_OF_INPUT:
2134 114 : return AssertionNode::AtEnd(on_success);
2135 : case END_OF_LINE: {
2136 : // Compile $ in multiline regexps as an alternation with a positive
2137 : // lookahead in one side and an end-of-input on the other side.
2138 : // We need two registers for the lookahead.
2139 0 : int stack_pointer_register = compiler->AllocateRegister();
2140 0 : int position_register = compiler->AllocateRegister();
2141 : // The ChoiceNode to distinguish between a newline and end-of-input.
2142 0 : ChoiceNode* result = alloc->newInfallible<ChoiceNode>(alloc, 2);
2143 : // Create a newline atom.
2144 0 : CharacterRangeVector* newline_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
2145 0 : CharacterRange::AddClassEscape(alloc, 'n', newline_ranges);
2146 0 : RegExpCharacterClass* newline_atom = alloc->newInfallible<RegExpCharacterClass>('n');
2147 : TextNode* newline_matcher =
2148 0 : alloc->newInfallible<TextNode>(newline_atom,
2149 0 : ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
2150 : position_register,
2151 : 0, // No captures inside.
2152 : -1, // Ignored if no captures.
2153 0 : on_success));
2154 : // Create an end-of-input matcher.
2155 : RegExpNode* end_of_line =
2156 0 : ActionNode::BeginSubmatch(stack_pointer_register, position_register, newline_matcher);
2157 :
2158 : // Add the two alternatives to the ChoiceNode.
2159 0 : GuardedAlternative eol_alternative(end_of_line);
2160 0 : result->AddAlternative(eol_alternative);
2161 0 : GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
2162 0 : result->AddAlternative(end_alternative);
2163 : return result;
2164 : }
2165 : case NOT_AFTER_LEAD_SURROGATE:
2166 0 : return AssertionNode::NotAfterLeadSurrogate(on_success);
2167 : case NOT_IN_SURROGATE_PAIR:
2168 0 : return AssertionNode::NotInSurrogatePair(on_success);
2169 : default:
2170 0 : MOZ_CRASH("Bad assertion type");
2171 : }
2172 : return on_success;
2173 : }
2174 :
2175 : RegExpNode*
2176 0 : RegExpBackReference::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
2177 : {
2178 0 : return compiler->alloc()->newInfallible<BackReferenceNode>(RegExpCapture::StartRegister(index()),
2179 0 : RegExpCapture::EndRegister(index()),
2180 0 : on_success);
2181 : }
2182 :
2183 : RegExpNode*
2184 0 : RegExpEmpty::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
2185 : {
2186 0 : return on_success;
2187 : }
2188 :
2189 : RegExpNode*
2190 0 : RegExpLookahead::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
2191 : {
2192 22 : int stack_pointer_register = compiler->AllocateRegister();
2193 0 : int position_register = compiler->AllocateRegister();
2194 :
2195 0 : const int registers_per_capture = 2;
2196 11 : const int register_of_first_capture = 2;
2197 0 : int register_count = capture_count_ * registers_per_capture;
2198 : int register_start =
2199 0 : register_of_first_capture + capture_from_ * registers_per_capture;
2200 :
2201 11 : if (!compiler->CheckOverRecursed())
2202 : return on_success;
2203 :
2204 0 : if (is_positive()) {
2205 : RegExpNode* bodyNode =
2206 20 : body()->ToNode(compiler,
2207 10 : ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
2208 : position_register,
2209 : register_count,
2210 : register_start,
2211 20 : on_success));
2212 0 : return ActionNode::BeginSubmatch(stack_pointer_register,
2213 : position_register,
2214 10 : bodyNode);
2215 : }
2216 :
2217 : // We use a ChoiceNode for a negative lookahead because it has most of
2218 : // the characteristics we need. It has the body of the lookahead as its
2219 : // first alternative and the expression after the lookahead of the second
2220 : // alternative. If the first alternative succeeds then the
2221 : // NegativeSubmatchSuccess will unwind the stack including everything the
2222 : // choice node set up and backtrack. If the first alternative fails then
2223 : // the second alternative is tried, which is exactly the desired result
2224 : // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
2225 : // ChoiceNode that knows to ignore the first exit when calculating quick
2226 : // checks.
2227 1 : LifoAlloc* alloc = compiler->alloc();
2228 :
2229 : RegExpNode* success =
2230 : alloc->newInfallible<NegativeSubmatchSuccess>(alloc,
2231 : stack_pointer_register,
2232 : position_register,
2233 : register_count,
2234 1 : register_start);
2235 2 : GuardedAlternative body_alt(body()->ToNode(compiler, success));
2236 :
2237 : ChoiceNode* choice_node =
2238 0 : alloc->newInfallible<NegativeLookaheadChoiceNode>(alloc, body_alt, GuardedAlternative(on_success));
2239 :
2240 0 : return ActionNode::BeginSubmatch(stack_pointer_register,
2241 : position_register,
2242 1 : choice_node);
2243 : }
2244 :
2245 : RegExpNode*
2246 0 : RegExpCapture::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
2247 : {
2248 315 : return ToNode(body(), index(), compiler, on_success);
2249 : }
2250 :
2251 : /* static */ RegExpNode*
2252 464 : RegExpCapture::ToNode(RegExpTree* body,
2253 : int index,
2254 : RegExpCompiler* compiler,
2255 : RegExpNode* on_success)
2256 : {
2257 464 : if (!compiler->CheckOverRecursed())
2258 : return on_success;
2259 :
2260 0 : int start_reg = RegExpCapture::StartRegister(index);
2261 0 : int end_reg = RegExpCapture::EndRegister(index);
2262 0 : RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
2263 464 : RegExpNode* body_node = body->ToNode(compiler, store_end);
2264 464 : return ActionNode::StorePosition(start_reg, true, body_node);
2265 : }
2266 :
2267 : RegExpNode*
2268 0 : RegExpAlternative::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
2269 : {
2270 196 : if (!compiler->CheckOverRecursed())
2271 : return on_success;
2272 :
2273 0 : const RegExpTreeVector& children = nodes();
2274 0 : RegExpNode* current = on_success;
2275 813 : for (int i = children.length() - 1; i >= 0 && !compiler->isRegExpTooBig(); i--)
2276 1234 : current = children[i]->ToNode(compiler, current);
2277 : return current;
2278 : }
2279 :
2280 : // -------------------------------------------------------------------
2281 : // BoyerMooreLookahead
2282 :
2283 : ContainedInLattice
2284 10480 : irregexp::AddRange(ContainedInLattice containment,
2285 : const int* ranges,
2286 : int ranges_length,
2287 : Interval new_range)
2288 : {
2289 0 : MOZ_ASSERT((ranges_length & 1) == 1);
2290 10480 : MOZ_ASSERT(ranges[ranges_length - 1] == kMaxUtf16CodeUnit + 1);
2291 10480 : if (containment == kLatticeUnknown) return containment;
2292 : bool inside = false;
2293 : int last = 0;
2294 58541 : for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
2295 : // Consider the range from last to ranges[i].
2296 : // We haven't got to the new range yet.
2297 33839 : if (ranges[i] <= new_range.from())
2298 : continue;
2299 :
2300 : // New range is wholly inside last-ranges[i]. Note that new_range.to() is
2301 : // inclusive, but the values in ranges are not.
2302 9137 : if (last <= new_range.from() && new_range.to() < ranges[i])
2303 18198 : return Combine(containment, inside ? kLatticeIn : kLatticeOut);
2304 :
2305 : return kLatticeUnknown;
2306 : }
2307 : return containment;
2308 : }
2309 :
2310 : void
2311 0 : BoyerMoorePositionInfo::Set(int character)
2312 : {
2313 2480 : SetInterval(Interval(character, character));
2314 0 : }
2315 :
2316 : void
2317 0 : BoyerMoorePositionInfo::SetInterval(const Interval& interval)
2318 : {
2319 0 : s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
2320 2620 : if (unicode_ignore_case_)
2321 0 : w_ = AddRange(w_, kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, interval);
2322 : else
2323 0 : w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2324 0 : d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
2325 0 : surrogate_ =
2326 0 : AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
2327 0 : if (interval.to() - interval.from() >= kMapSize - 1) {
2328 0 : if (map_count_ != kMapSize) {
2329 0 : map_count_ = kMapSize;
2330 1290 : for (int i = 0; i < kMapSize; i++)
2331 2560 : map_[i] = true;
2332 : }
2333 : return;
2334 : }
2335 0 : MOZ_ASSERT(interval.from() <= interval.to());
2336 0 : for (int i = interval.from(); i != interval.to() + 1; i++) {
2337 0 : int mod_character = (i & kMask);
2338 0 : if (!map_[mod_character]) {
2339 1871 : map_count_++;
2340 0 : map_[mod_character] = true;
2341 : }
2342 3781 : if (map_count_ == kMapSize)
2343 : return;
2344 : }
2345 : }
2346 :
2347 : void
2348 0 : BoyerMoorePositionInfo::SetAll()
2349 : {
2350 0 : s_ = w_ = d_ = kLatticeUnknown;
2351 0 : if (map_count_ != kMapSize) {
2352 0 : map_count_ = kMapSize;
2353 3096 : for (int i = 0; i < kMapSize; i++)
2354 0 : map_[i] = true;
2355 : }
2356 0 : }
2357 :
2358 59 : BoyerMooreLookahead::BoyerMooreLookahead(LifoAlloc* alloc, size_t length, RegExpCompiler* compiler)
2359 0 : : length_(length), compiler_(compiler), bitmaps_(*alloc)
2360 : {
2361 59 : bool unicode_ignore_case = compiler->unicode() && compiler->ignore_case();
2362 0 : max_char_ = MaximumCharacter(compiler->latin1());
2363 :
2364 0 : bitmaps_.reserve(length);
2365 0 : for (size_t i = 0; i < length; i++)
2366 286 : bitmaps_.append(alloc->newInfallible<BoyerMoorePositionInfo>(alloc, unicode_ignore_case));
2367 59 : }
2368 :
2369 : // Find the longest range of lookahead that has the fewest number of different
2370 : // characters that can occur at a given position. Since we are optimizing two
2371 : // different parameters at once this is a tradeoff.
2372 0 : bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
2373 58 : int biggest_points = 0;
2374 : // If more than 32 characters out of 128 can occur it is unlikely that we can
2375 : // be lucky enough to step forwards much of the time.
2376 0 : const int kMaxMax = 32;
2377 0 : for (int max_number_of_chars = 4;
2378 232 : max_number_of_chars < kMaxMax;
2379 0 : max_number_of_chars *= 2) {
2380 : biggest_points =
2381 0 : FindBestInterval(max_number_of_chars, biggest_points, from, to);
2382 : }
2383 58 : if (biggest_points == 0) return false;
2384 0 : return true;
2385 : }
2386 :
2387 : // Find the highest-points range between 0 and length_ where the character
2388 : // information is not too vague. 'Too vague' means that there are more than
2389 : // max_number_of_chars that can occur at this position. Calculates the number
2390 : // of points as the product of width-of-the-range and
2391 : // probability-of-finding-one-of-the-characters, where the probability is
2392 : // calculated using the frequency distribution of the sample subject string.
2393 : int
2394 174 : BoyerMooreLookahead::FindBestInterval(int max_number_of_chars, int old_biggest_points,
2395 : int* from, int* to)
2396 : {
2397 0 : int biggest_points = old_biggest_points;
2398 : static const int kSize = RegExpMacroAssembler::kTableSize;
2399 0 : for (int i = 0; i < length_; ) {
2400 837 : while (i < length_ && Count(i) > max_number_of_chars) i++;
2401 217 : if (i == length_) break;
2402 : int remembered_from = i;
2403 : bool union_map[kSize];
2404 0 : for (int j = 0; j < kSize; j++) union_map[j] = false;
2405 0 : while (i < length_ && Count(i) <= max_number_of_chars) {
2406 0 : BoyerMoorePositionInfo* map = bitmaps_[i];
2407 79980 : for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
2408 620 : i++;
2409 : }
2410 : int frequency = 0;
2411 38550 : for (int j = 0; j < kSize; j++) {
2412 19200 : if (union_map[j]) {
2413 : // Add 1 to the frequency to give a small per-character boost for
2414 : // the cases where our sampling is not good enough and many
2415 : // characters have a frequency of zero. This means the frequency
2416 : // can theoretically be up to 2*kSize though we treat it mostly as
2417 : // a fraction of kSize.
2418 2136 : frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2419 : }
2420 : }
2421 : // We use the probability of skipping times the distance we are skipping to
2422 : // judge the effectiveness of this. Actually we have a cut-off: By
2423 : // dividing by 2 we switch off the skipping if the probability of skipping
2424 : // is less than 50%. This is because the multibyte mask-and-compare
2425 : // skipping in quickcheck is more likely to do well on this case.
2426 223 : bool in_quickcheck_range = ((i - remembered_from < 4) ||
2427 76 : (compiler_->latin1() ? remembered_from <= 4 : remembered_from <= 2));
2428 : // Called 'probability' but it is only a rough estimate and can actually
2429 : // be outside the 0-kSize range.
2430 0 : int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2431 0 : int points = (i - remembered_from) * probability;
2432 0 : if (points > biggest_points) {
2433 0 : *from = remembered_from;
2434 53 : *to = i - 1;
2435 53 : biggest_points = points;
2436 : }
2437 : }
2438 174 : return biggest_points;
2439 : }
2440 :
2441 : // Take all the characters that will not prevent a successful match if they
2442 : // occur in the subject string in the range between min_lookahead and
2443 : // max_lookahead (inclusive) measured from the current position. If the
2444 : // character at max_lookahead offset is not one of these characters, then we
2445 : // can safely skip forwards by the number of characters in the range.
2446 35 : int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
2447 : int max_lookahead,
2448 : uint8_t* boolean_skip_table)
2449 : {
2450 0 : const int kSize = RegExpMacroAssembler::kTableSize;
2451 :
2452 35 : const int kSkipArrayEntry = 0;
2453 0 : const int kDontSkipArrayEntry = 1;
2454 :
2455 0 : for (int i = 0; i < kSize; i++)
2456 4480 : boolean_skip_table[i] = kSkipArrayEntry;
2457 0 : int skip = max_lookahead + 1 - min_lookahead;
2458 :
2459 0 : for (int i = max_lookahead; i >= min_lookahead; i--) {
2460 0 : BoyerMoorePositionInfo* map = bitmaps_[i];
2461 0 : for (int j = 0; j < kSize; j++) {
2462 21120 : if (map->at(j))
2463 581 : boolean_skip_table[j] = kDontSkipArrayEntry;
2464 : }
2465 : }
2466 :
2467 35 : return skip;
2468 : }
2469 :
2470 : // See comment on the implementation of GetSkipTable.
2471 : bool
2472 0 : BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm)
2473 : {
2474 0 : const int kSize = RegExpMacroAssembler::kTableSize;
2475 :
2476 58 : int min_lookahead = 0;
2477 0 : int max_lookahead = 0;
2478 :
2479 58 : if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead))
2480 : return false;
2481 :
2482 0 : bool found_single_character = false;
2483 0 : int single_character = 0;
2484 0 : for (int i = max_lookahead; i >= min_lookahead; i--) {
2485 0 : BoyerMoorePositionInfo* map = bitmaps_[i];
2486 110 : if (map->map_count() > 1 ||
2487 8 : (found_single_character && map->map_count() != 0)) {
2488 : found_single_character = false;
2489 : break;
2490 : }
2491 2812 : for (int j = 0; j < kSize; j++) {
2492 1416 : if (map->at(j)) {
2493 : found_single_character = true;
2494 : single_character = j;
2495 : break;
2496 : }
2497 : }
2498 : }
2499 :
2500 0 : int lookahead_width = max_lookahead + 1 - min_lookahead;
2501 :
2502 47 : if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2503 : // The mask-compare can probably handle this better.
2504 : return false;
2505 : }
2506 :
2507 0 : if (found_single_character) {
2508 0 : jit::Label cont, again;
2509 0 : masm->Bind(&again);
2510 0 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2511 0 : if (max_char_ > kSize) {
2512 0 : masm->CheckCharacterAfterAnd(single_character,
2513 : RegExpMacroAssembler::kTableMask,
2514 0 : &cont);
2515 : } else {
2516 0 : masm->CheckCharacter(single_character, &cont);
2517 : }
2518 0 : masm->AdvanceCurrentPosition(lookahead_width);
2519 0 : masm->JumpOrBacktrack(&again);
2520 0 : masm->Bind(&cont);
2521 : return true;
2522 : }
2523 :
2524 0 : RegExpShared::JitCodeTable boolean_skip_table;
2525 : {
2526 0 : AutoEnterOOMUnsafeRegion oomUnsafe;
2527 1 : boolean_skip_table.reset(static_cast<uint8_t*>(js_malloc(kSize)));
2528 35 : if (!boolean_skip_table)
2529 0 : oomUnsafe.crash("Table malloc");
2530 : }
2531 :
2532 35 : int skip_distance = GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table.get());
2533 0 : MOZ_ASSERT(skip_distance != 0);
2534 :
2535 0 : jit::Label cont, again;
2536 0 : masm->Bind(&again);
2537 0 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2538 0 : masm->CheckBitInTable(std::move(boolean_skip_table), &cont);
2539 0 : masm->AdvanceCurrentPosition(skip_distance);
2540 35 : masm->JumpOrBacktrack(&again);
2541 35 : masm->Bind(&cont);
2542 :
2543 : return true;
2544 : }
2545 :
2546 : bool
2547 0 : RegExpCompiler::CheckOverRecursed()
2548 : {
2549 1795 : if (!CheckRecursionLimitDontReport(cx())) {
2550 : #ifdef JS_MORE_DETERMINISTIC
2551 : // We don't report overrecursion here, but we throw an exception later
2552 : // and this still affects differential testing. Mimic ReportOverRecursed
2553 : // (the fuzzers check for this particular string).
2554 : fprintf(stderr, "ReportOverRecursed called\n");
2555 : #endif
2556 0 : SetRegExpTooBig();
2557 0 : return false;
2558 : }
2559 :
2560 : return true;
2561 : }
2562 :
2563 : bool
2564 0 : BoyerMooreLookahead::CheckOverRecursed()
2565 : {
2566 1580 : return compiler()->CheckOverRecursed();
2567 : }
2568 :
2569 : // -------------------------------------------------------------------
2570 : // Trace
2571 :
2572 0 : bool Trace::DeferredAction::Mentions(int that)
2573 : {
2574 12330 : if (action_type() == ActionNode::CLEAR_CAPTURES) {
2575 85 : Interval range = static_cast<DeferredClearCaptures*>(this)->range();
2576 : return range.Contains(that);
2577 : }
2578 12245 : return reg() == that;
2579 : }
2580 :
2581 0 : bool Trace::mentions_reg(int reg)
2582 : {
2583 127 : for (DeferredAction* action = actions_; action != nullptr; action = action->next()) {
2584 0 : if (action->Mentions(reg))
2585 : return true;
2586 : }
2587 : return false;
2588 : }
2589 :
2590 : bool
2591 0 : Trace::GetStoredPosition(int reg, int* cp_offset)
2592 : {
2593 0 : MOZ_ASSERT(0 == *cp_offset);
2594 0 : for (DeferredAction* action = actions_; action != nullptr; action = action->next()) {
2595 0 : if (action->Mentions(reg)) {
2596 0 : if (action->action_type() == ActionNode::STORE_POSITION) {
2597 0 : *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
2598 0 : return true;
2599 : }
2600 : return false;
2601 : }
2602 : }
2603 : return false;
2604 : }
2605 :
2606 : int
2607 0 : Trace::FindAffectedRegisters(LifoAlloc* alloc, OutSet* affected_registers)
2608 : {
2609 0 : int max_register = RegExpCompiler::kNoRegister;
2610 0 : for (DeferredAction* action = actions_; action != nullptr; action = action->next()) {
2611 0 : if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
2612 0 : Interval range = static_cast<DeferredClearCaptures*>(action)->range();
2613 0 : for (int i = range.from(); i <= range.to(); i++)
2614 60 : affected_registers->Set(alloc, i);
2615 0 : if (range.to() > max_register) max_register = range.to();
2616 : } else {
2617 3351 : affected_registers->Set(alloc, action->reg());
2618 3351 : if (action->reg() > max_register) max_register = action->reg();
2619 : }
2620 : }
2621 1136 : return max_register;
2622 : }
2623 :
2624 : void
2625 1136 : Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
2626 : int max_register,
2627 : OutSet& registers_to_pop,
2628 : OutSet& registers_to_clear)
2629 : {
2630 0 : for (int reg = max_register; reg >= 0; reg--) {
2631 12956 : if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
2632 0 : else if (registers_to_clear.Get(reg)) {
2633 : int clear_to = reg;
2634 0 : while (reg > 0 && registers_to_clear.Get(reg - 1))
2635 897 : reg--;
2636 859 : assembler->ClearRegisters(reg, clear_to);
2637 : }
2638 : }
2639 1136 : }
2640 :
2641 : enum DeferredActionUndoType {
2642 : DEFER_IGNORE,
2643 : DEFER_RESTORE,
2644 : DEFER_CLEAR
2645 : };
2646 :
2647 : void
2648 1136 : Trace::PerformDeferredActions(LifoAlloc* alloc,
2649 : RegExpMacroAssembler* assembler,
2650 : int max_register,
2651 : OutSet& affected_registers,
2652 : OutSet* registers_to_pop,
2653 : OutSet* registers_to_clear)
2654 : {
2655 : // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
2656 1136 : const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
2657 :
2658 : // Count pushes performed to force a stack limit check occasionally.
2659 0 : int pushes = 0;
2660 :
2661 14989 : for (int reg = 0; reg <= max_register; reg++) {
2662 13853 : if (!affected_registers.Get(reg))
2663 : continue;
2664 :
2665 : // The chronologically first deferred action in the trace
2666 : // is used to infer the action needed to restore a register
2667 : // to its previous state (or not, if it's safe to ignore it).
2668 0 : DeferredActionUndoType undo_action = DEFER_IGNORE;
2669 :
2670 0 : int value = 0;
2671 0 : bool absolute = false;
2672 3362 : bool clear = false;
2673 3362 : int store_position = -1;
2674 : // This is a little tricky because we are scanning the actions in reverse
2675 : // historical order (newest first).
2676 15692 : for (DeferredAction* action = actions_;
2677 0 : action != nullptr;
2678 : action = action->next()) {
2679 12330 : if (action->Mentions(reg)) {
2680 3411 : switch (action->action_type()) {
2681 : case ActionNode::SET_REGISTER: {
2682 : Trace::DeferredSetRegister* psr =
2683 0 : static_cast<Trace::DeferredSetRegister*>(action);
2684 0 : if (!absolute) {
2685 39 : value += psr->value();
2686 39 : absolute = true;
2687 : }
2688 : // SET_REGISTER is currently only used for newly introduced loop
2689 : // counters. They can have a significant previous value if they
2690 : // occour in a loop. TODO(lrn): Propagate this information, so
2691 : // we can set undo_action to IGNORE if we know there is no value to
2692 : // restore.
2693 0 : undo_action = DEFER_RESTORE;
2694 39 : MOZ_ASSERT(store_position == -1);
2695 39 : MOZ_ASSERT(!clear);
2696 : break;
2697 : }
2698 : case ActionNode::INCREMENT_REGISTER:
2699 43 : if (!absolute) {
2700 0 : value++;
2701 : }
2702 43 : MOZ_ASSERT(store_position == -1);
2703 43 : MOZ_ASSERT(!clear);
2704 : undo_action = DEFER_RESTORE;
2705 : break;
2706 : case ActionNode::STORE_POSITION: {
2707 : Trace::DeferredCapture* pc =
2708 0 : static_cast<Trace::DeferredCapture*>(action);
2709 3269 : if (!clear && store_position == -1) {
2710 3269 : store_position = pc->cp_offset();
2711 : }
2712 :
2713 : // For captures we know that stores and clears alternate.
2714 : // Other register, are never cleared, and if the occur
2715 : // inside a loop, they might be assigned more than once.
2716 3269 : if (reg <= 1) {
2717 : // Registers zero and one, aka "capture zero", is
2718 : // always set correctly if we succeed. There is no
2719 : // need to undo a setting on backtrack, because we
2720 : // will set it again or fail.
2721 : undo_action = DEFER_IGNORE;
2722 : } else {
2723 0 : undo_action = pc->is_capture() ? DEFER_CLEAR : DEFER_RESTORE;
2724 : }
2725 3269 : MOZ_ASSERT(!absolute);
2726 3269 : MOZ_ASSERT(value == 0);
2727 : break;
2728 : }
2729 : case ActionNode::CLEAR_CAPTURES: {
2730 : // Since we're scanning in reverse order, if we've already
2731 : // set the position we have to ignore historically earlier
2732 : // clearing operations.
2733 60 : if (store_position == -1) {
2734 0 : clear = true;
2735 : }
2736 0 : undo_action = DEFER_RESTORE;
2737 60 : MOZ_ASSERT(!absolute);
2738 60 : MOZ_ASSERT(value == 0);
2739 : break;
2740 : }
2741 : default:
2742 0 : MOZ_CRASH("Bad action");
2743 : }
2744 : }
2745 : }
2746 : // Prepare for the undo-action (e.g., push if it's going to be popped).
2747 3362 : if (undo_action == DEFER_RESTORE) {
2748 0 : pushes++;
2749 : RegExpMacroAssembler::StackCheckFlag stack_check =
2750 0 : RegExpMacroAssembler::kNoStackLimitCheck;
2751 0 : if (pushes == push_limit) {
2752 0 : stack_check = RegExpMacroAssembler::kCheckStackLimit;
2753 0 : pushes = 0;
2754 : }
2755 :
2756 0 : assembler->PushRegister(reg, stack_check);
2757 0 : registers_to_pop->Set(alloc, reg);
2758 3220 : } else if (undo_action == DEFER_CLEAR) {
2759 1756 : registers_to_clear->Set(alloc, reg);
2760 : }
2761 : // Perform the chronologically last action (or accumulated increment)
2762 : // for the register.
2763 0 : if (store_position != -1) {
2764 0 : assembler->WriteCurrentPositionToRegister(reg, store_position);
2765 0 : } else if (clear) {
2766 0 : assembler->ClearRegisters(reg, reg);
2767 0 : } else if (absolute) {
2768 0 : assembler->SetRegister(reg, value);
2769 43 : } else if (value != 0) {
2770 43 : assembler->AdvanceRegister(reg, value);
2771 : }
2772 : }
2773 1136 : }
2774 :
2775 : // This is called as we come into a loop choice node and some other tricky
2776 : // nodes. It normalizes the state of the code generator to ensure we can
2777 : // generate generic code.
2778 0 : void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor)
2779 : {
2780 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2781 :
2782 0 : MOZ_ASSERT(!is_trivial());
2783 :
2784 1246 : if (actions_ == nullptr && backtrack() == nullptr) {
2785 : // Here we just have some deferred cp advances to fix and we are back to
2786 : // a normal situation. We may also have to forget some information gained
2787 : // through a quick check that was already performed.
2788 0 : if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
2789 : // Create a new trivial state and generate the node with that.
2790 110 : Trace new_state;
2791 110 : successor->Emit(compiler, &new_state);
2792 : return;
2793 : }
2794 :
2795 : // Generate deferred actions here along with code to undo them again.
2796 0 : OutSet affected_registers;
2797 :
2798 1136 : if (backtrack() != nullptr) {
2799 : // Here we have a concrete backtrack location. These are set up by choice
2800 : // nodes and so they indicate that we have a deferred save of the current
2801 : // position which we may need to emit here.
2802 976 : assembler->PushCurrentPosition();
2803 : }
2804 :
2805 0 : int max_register = FindAffectedRegisters(compiler->alloc(), &affected_registers);
2806 0 : OutSet registers_to_pop;
2807 1136 : OutSet registers_to_clear;
2808 1136 : PerformDeferredActions(compiler->alloc(),
2809 : assembler,
2810 : max_register,
2811 : affected_registers,
2812 : ®isters_to_pop,
2813 0 : ®isters_to_clear);
2814 1136 : if (cp_offset_ != 0)
2815 1031 : assembler->AdvanceCurrentPosition(cp_offset_);
2816 :
2817 : // Create a new trivial state and generate the node with that.
2818 0 : jit::Label undo;
2819 0 : assembler->PushBacktrack(&undo);
2820 1136 : Trace new_state;
2821 1136 : successor->Emit(compiler, &new_state);
2822 :
2823 : // On backtrack we need to restore state.
2824 1136 : assembler->BindBacktrack(&undo);
2825 : RestoreAffectedRegisters(assembler,
2826 : max_register,
2827 : registers_to_pop,
2828 0 : registers_to_clear);
2829 1136 : if (backtrack() == nullptr) {
2830 0 : assembler->Backtrack();
2831 : } else {
2832 976 : assembler->PopCurrentPosition();
2833 976 : assembler->JumpOrBacktrack(backtrack());
2834 : }
2835 : }
2836 :
2837 : void
2838 0 : Trace::InvalidateCurrentCharacter()
2839 : {
2840 129 : characters_preloaded_ = 0;
2841 0 : }
2842 :
2843 : void
2844 0 : Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler)
2845 : {
2846 1364 : MOZ_ASSERT(by > 0);
2847 : // We don't have an instruction for shifting the current character register
2848 : // down or for using a shifted value for anything so lets just forget that
2849 : // we preloaded any characters into it.
2850 1364 : characters_preloaded_ = 0;
2851 : // Adjust the offsets of the quick check performed information. This
2852 : // information is used to find out what we already determined about the
2853 : // characters by means of mask and compare.
2854 0 : quick_check_performed_.Advance(by);
2855 1 : cp_offset_ += by;
2856 1 : if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2857 0 : compiler->SetRegExpTooBig();
2858 0 : cp_offset_ = 0;
2859 : }
2860 2728 : bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
2861 1364 : }
2862 :
2863 : void
2864 0 : OutSet::Set(LifoAlloc* alloc, unsigned value)
2865 : {
2866 5309 : if (value < kFirstLimit) {
2867 0 : first_ |= (1 << value);
2868 : } else {
2869 304 : if (remaining_ == nullptr)
2870 0 : remaining_ = alloc->newInfallible<RemainingVector>(*alloc);
2871 :
2872 1064 : for (size_t i = 0; i < remaining().length(); i++) {
2873 304 : if (remaining()[i] == value)
2874 : return;
2875 : }
2876 304 : remaining().append(value);
2877 : }
2878 : }
2879 :
2880 : bool
2881 0 : OutSet::Get(unsigned value)
2882 : {
2883 0 : if (value < kFirstLimit)
2884 26400 : return (first_ & (1 << value)) != 0;
2885 0 : if (remaining_ == nullptr)
2886 : return false;
2887 79120 : for (size_t i = 0; i < remaining().length(); i++) {
2888 39940 : if (remaining()[i] == value)
2889 : return true;
2890 : }
2891 : return false;
2892 : }
2893 :
2894 : // -------------------------------------------------------------------
2895 : // Graph emitting
2896 :
2897 : void
2898 0 : NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace)
2899 : {
2900 1 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2901 :
2902 : // Omit flushing the trace. We discard the entire stack frame anyway.
2903 :
2904 2 : if (!label()->bound()) {
2905 : // We are completely independent of the trace, since we ignore it,
2906 : // so this code can be used as the generic version.
2907 2 : assembler->Bind(label());
2908 : }
2909 :
2910 : // Throw away everything on the backtrack stack since the start
2911 : // of the negative submatch and restore the character position.
2912 1 : assembler->ReadCurrentPositionFromRegister(current_position_register_);
2913 0 : assembler->ReadBacktrackStackPointerFromRegister(stack_pointer_register_);
2914 :
2915 1 : if (clear_capture_count_ > 0) {
2916 : // Clear any captures that might have been performed during the success
2917 : // of the body of the negative look-ahead.
2918 0 : int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
2919 0 : assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
2920 : }
2921 :
2922 : // Now that we have unwound the stack we find at the top of the stack the
2923 : // backtrack that the BeginSubmatch node got.
2924 1 : assembler->Backtrack();
2925 1 : }
2926 :
2927 : void
2928 0 : EndNode::Emit(RegExpCompiler* compiler, Trace* trace)
2929 : {
2930 0 : if (!trace->is_trivial()) {
2931 551 : trace->Flush(compiler, this);
2932 0 : return;
2933 : }
2934 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2935 1102 : if (!label()->bound()) {
2936 0 : assembler->Bind(label());
2937 : }
2938 0 : switch (action_) {
2939 : case ACCEPT:
2940 551 : assembler->Succeed();
2941 0 : return;
2942 : case BACKTRACK:
2943 0 : assembler->JumpOrBacktrack(trace->backtrack());
2944 0 : return;
2945 : case NEGATIVE_SUBMATCH_SUCCESS:
2946 : // This case is handled in a different virtual method.
2947 0 : MOZ_CRASH("Bad action: NEGATIVE_SUBMATCH_SUCCESS");
2948 : }
2949 0 : MOZ_CRASH("Bad action");
2950 : }
2951 :
2952 : // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2953 : // that matches newline or the start of input).
2954 : static void
2955 0 : EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace)
2956 : {
2957 1 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2958 :
2959 : // We will be loading the previous character into the current character
2960 : // register.
2961 1 : Trace new_trace(*trace);
2962 0 : new_trace.InvalidateCurrentCharacter();
2963 :
2964 2 : jit::Label ok;
2965 1 : if (new_trace.cp_offset() == 0) {
2966 : // The start of input counts as a newline in this context, so skip to
2967 : // ok if we are at the start.
2968 1 : assembler->CheckAtStart(&ok);
2969 : }
2970 :
2971 : // We already checked that we are not at the start of input so it must be
2972 : // OK to load the previous character.
2973 0 : assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, new_trace.backtrack(), false);
2974 :
2975 1 : if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
2976 : // Newline means \n, \r, 0x2028 or 0x2029.
2977 0 : if (!compiler->latin1())
2978 0 : assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2979 0 : assembler->CheckCharacter('\n', &ok);
2980 0 : assembler->CheckNotCharacter('\r', new_trace.backtrack());
2981 : }
2982 0 : assembler->Bind(&ok);
2983 1 : on_success->Emit(compiler, &new_trace);
2984 1 : }
2985 :
2986 : // Assert that the next character cannot be a part of a surrogate pair.
2987 : static void
2988 0 : EmitNotAfterLeadSurrogate(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace)
2989 : {
2990 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2991 :
2992 : // We will be loading the previous character into the current character
2993 : // register.
2994 0 : Trace new_trace(*trace);
2995 0 : new_trace.InvalidateCurrentCharacter();
2996 :
2997 0 : jit::Label ok;
2998 0 : if (new_trace.cp_offset() == 0)
2999 0 : assembler->CheckAtStart(&ok);
3000 :
3001 : // We already checked that we are not at the start of input so it must be
3002 : // OK to load the previous character.
3003 0 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, new_trace.backtrack(), false);
3004 0 : assembler->CheckCharacterInRange(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax,
3005 0 : new_trace.backtrack());
3006 :
3007 0 : assembler->Bind(&ok);
3008 0 : on_success->Emit(compiler, &new_trace);
3009 0 : }
3010 :
3011 : // Assert that the next character is not a trail surrogate that has a
3012 : // corresponding lead surrogate.
3013 : static void
3014 0 : EmitNotInSurrogatePair(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace)
3015 : {
3016 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3017 :
3018 0 : jit::Label ok;
3019 0 : assembler->CheckPosition(trace->cp_offset(), &ok);
3020 :
3021 : // We will be loading the next and previous characters into the current
3022 : // character register.
3023 0 : Trace new_trace(*trace);
3024 0 : new_trace.InvalidateCurrentCharacter();
3025 :
3026 0 : if (new_trace.cp_offset() == 0)
3027 0 : assembler->CheckAtStart(&ok);
3028 :
3029 : // First check if next character is a trail surrogate.
3030 0 : assembler->LoadCurrentCharacter(new_trace.cp_offset(), new_trace.backtrack(), false);
3031 : assembler->CheckCharacterNotInRange(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax,
3032 0 : &ok);
3033 :
3034 : // Next check if previous character is a lead surrogate.
3035 : // We already checked that we are not at the start of input so it must be
3036 : // OK to load the previous character.
3037 0 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, new_trace.backtrack(), false);
3038 0 : assembler->CheckCharacterInRange(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax,
3039 0 : new_trace.backtrack());
3040 :
3041 0 : assembler->Bind(&ok);
3042 0 : on_success->Emit(compiler, &new_trace);
3043 0 : }
3044 :
3045 : // Check for [0-9A-Z_a-z].
3046 : static void
3047 2 : EmitWordCheck(RegExpMacroAssembler* assembler,
3048 : jit::Label* word, jit::Label* non_word, bool fall_through_on_word,
3049 : bool unicode_ignore_case)
3050 : {
3051 0 : if (!unicode_ignore_case &&
3052 2 : assembler->CheckSpecialCharacterClass(fall_through_on_word ? 'w' : 'W',
3053 2 : fall_through_on_word ? non_word : word))
3054 : {
3055 : // Optimized implementation available.
3056 : return;
3057 : }
3058 :
3059 0 : if (unicode_ignore_case) {
3060 0 : assembler->CheckCharacter(0x017F, word);
3061 0 : assembler->CheckCharacter(0x212A, word);
3062 : }
3063 :
3064 0 : assembler->CheckCharacterGT('z', non_word);
3065 0 : assembler->CheckCharacterLT('0', non_word);
3066 0 : assembler->CheckCharacterGT('a' - 1, word);
3067 0 : assembler->CheckCharacterLT('9' + 1, word);
3068 0 : assembler->CheckCharacterLT('A', non_word);
3069 0 : assembler->CheckCharacterLT('Z' + 1, word);
3070 :
3071 0 : if (fall_through_on_word)
3072 0 : assembler->CheckNotCharacter('_', non_word);
3073 : else
3074 0 : assembler->CheckCharacter('_', word);
3075 : }
3076 :
3077 : // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3078 : void
3079 0 : AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace)
3080 : {
3081 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3082 0 : Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3083 0 : bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3084 4 : BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3085 0 : if (lookahead == nullptr) {
3086 : int eats_at_least =
3087 0 : Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3088 : kRecursionBudget,
3089 2 : not_at_start));
3090 0 : if (eats_at_least >= 1) {
3091 : BoyerMooreLookahead* bm =
3092 0 : alloc()->newInfallible<BoyerMooreLookahead>(alloc(), eats_at_least, compiler);
3093 0 : FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
3094 0 : if (bm->at(0)->is_non_word())
3095 1 : next_is_word_character = Trace::FALSE_VALUE;
3096 1 : if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3097 : }
3098 : } else {
3099 0 : if (lookahead->at(0)->is_non_word())
3100 0 : next_is_word_character = Trace::FALSE_VALUE;
3101 1 : if (lookahead->at(0)->is_word())
3102 0 : next_is_word_character = Trace::TRUE_VALUE;
3103 : }
3104 0 : bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3105 0 : if (next_is_word_character == Trace::UNKNOWN) {
3106 0 : jit::Label before_non_word;
3107 0 : jit::Label before_word;
3108 0 : if (trace->characters_preloaded() != 1) {
3109 0 : assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3110 : }
3111 : // Fall through on non-word.
3112 0 : EmitWordCheck(assembler, &before_word, &before_non_word, false,
3113 0 : compiler->unicode() && compiler->ignore_case());
3114 : // Next character is not a word character.
3115 0 : assembler->Bind(&before_non_word);
3116 0 : jit::Label ok;
3117 0 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3118 0 : assembler->JumpOrBacktrack(&ok);
3119 :
3120 0 : assembler->Bind(&before_word);
3121 0 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3122 0 : assembler->Bind(&ok);
3123 2 : } else if (next_is_word_character == Trace::TRUE_VALUE) {
3124 0 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3125 : } else {
3126 1 : MOZ_ASSERT(next_is_word_character == Trace::FALSE_VALUE);
3127 0 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3128 : }
3129 2 : }
3130 :
3131 : void
3132 2 : AssertionNode::BacktrackIfPrevious(RegExpCompiler* compiler,
3133 : Trace* trace,
3134 : AssertionNode::IfPrevious backtrack_if_previous)
3135 : {
3136 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3137 2 : Trace new_trace(*trace);
3138 0 : new_trace.InvalidateCurrentCharacter();
3139 :
3140 0 : jit::Label fall_through, dummy;
3141 :
3142 2 : jit::Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack() : &fall_through;
3143 0 : jit::Label* word = backtrack_if_previous == kIsNonWord ? &fall_through : new_trace.backtrack();
3144 :
3145 2 : if (new_trace.cp_offset() == 0) {
3146 : // The start of input counts as a non-word character, so the question is
3147 : // decided if we are at the start.
3148 1 : assembler->CheckAtStart(non_word);
3149 : }
3150 : // We already checked that we are not at the start of input so it must be
3151 : // OK to load the previous character.
3152 0 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3153 2 : EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord,
3154 0 : compiler->unicode() && compiler->ignore_case());
3155 :
3156 0 : assembler->Bind(&fall_through);
3157 2 : on_success()->Emit(compiler, &new_trace);
3158 2 : }
3159 :
3160 : void
3161 116 : AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3162 : RegExpCompiler* compiler,
3163 : int filled_in,
3164 : bool not_at_start)
3165 : {
3166 116 : if (assertion_type_ == AT_START && not_at_start) {
3167 72 : details->set_cannot_match();
3168 : return;
3169 : }
3170 44 : return on_success()->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
3171 : }
3172 :
3173 : void
3174 0 : AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace)
3175 : {
3176 522 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3177 0 : switch (assertion_type_) {
3178 : case AT_END: {
3179 0 : jit::Label ok;
3180 0 : assembler->CheckPosition(trace->cp_offset(), &ok);
3181 408 : assembler->JumpOrBacktrack(trace->backtrack());
3182 408 : assembler->Bind(&ok);
3183 : break;
3184 : }
3185 : case AT_START: {
3186 0 : if (trace->at_start() == Trace::FALSE_VALUE) {
3187 0 : assembler->JumpOrBacktrack(trace->backtrack());
3188 0 : return;
3189 : }
3190 0 : if (trace->at_start() == Trace::UNKNOWN) {
3191 0 : assembler->CheckNotAtStart(trace->backtrack());
3192 0 : Trace at_start_trace = *trace;
3193 111 : at_start_trace.set_at_start(true);
3194 111 : on_success()->Emit(compiler, &at_start_trace);
3195 : return;
3196 : }
3197 : }
3198 : break;
3199 : case AFTER_NEWLINE:
3200 1 : EmitHat(compiler, on_success(), trace);
3201 1 : return;
3202 : case AT_BOUNDARY:
3203 : case AT_NON_BOUNDARY: {
3204 2 : EmitBoundaryCheck(compiler, trace);
3205 2 : return;
3206 : }
3207 : case NOT_AFTER_LEAD_SURROGATE:
3208 0 : EmitNotAfterLeadSurrogate(compiler, on_success(), trace);
3209 0 : return;
3210 : case NOT_IN_SURROGATE_PAIR:
3211 0 : EmitNotInSurrogatePair(compiler, on_success(), trace);
3212 0 : return;
3213 : }
3214 408 : on_success()->Emit(compiler, trace);
3215 : }
3216 :
3217 : static bool
3218 : DeterminedAlready(QuickCheckDetails* quick_check, int offset)
3219 : {
3220 0 : if (quick_check == nullptr)
3221 : return false;
3222 0 : if (offset >= quick_check->characters())
3223 : return false;
3224 10551 : return quick_check->positions(offset)->determines_perfectly;
3225 : }
3226 :
3227 : static void
3228 : UpdateBoundsCheck(int index, int* checked_up_to)
3229 : {
3230 20807 : if (index > *checked_up_to)
3231 1090 : *checked_up_to = index;
3232 : }
3233 :
3234 : static void
3235 8 : EmitBoundaryTest(RegExpMacroAssembler* masm,
3236 : int border,
3237 : jit::Label* fall_through,
3238 : jit::Label* above_or_equal,
3239 : jit::Label* below)
3240 : {
3241 0 : if (below != fall_through) {
3242 0 : masm->CheckCharacterLT(border, below);
3243 8 : if (above_or_equal != fall_through)
3244 0 : masm->JumpOrBacktrack(above_or_equal);
3245 : } else {
3246 0 : masm->CheckCharacterGT(border - 1, above_or_equal);
3247 : }
3248 8 : }
3249 :
3250 : static void
3251 178 : EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
3252 : int first,
3253 : int last,
3254 : jit::Label* fall_through,
3255 : jit::Label* in_range,
3256 : jit::Label* out_of_range)
3257 : {
3258 0 : if (in_range == fall_through) {
3259 48 : if (first == last)
3260 0 : masm->CheckNotCharacter(first, out_of_range);
3261 : else
3262 0 : masm->CheckCharacterNotInRange(first, last, out_of_range);
3263 : } else {
3264 130 : if (first == last)
3265 0 : masm->CheckCharacter(first, in_range);
3266 : else
3267 0 : masm->CheckCharacterInRange(first, last, in_range);
3268 130 : if (out_of_range != fall_through)
3269 0 : masm->JumpOrBacktrack(out_of_range);
3270 : }
3271 178 : }
3272 :
3273 : typedef InfallibleVector<int, 4> RangeBoundaryVector;
3274 :
3275 : // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
3276 : // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
3277 : static void
3278 17 : EmitUseLookupTable(RegExpMacroAssembler* masm,
3279 : RangeBoundaryVector& ranges,
3280 : int start_index,
3281 : int end_index,
3282 : int min_char,
3283 : jit::Label* fall_through,
3284 : jit::Label* even_label,
3285 : jit::Label* odd_label)
3286 : {
3287 : static const int kSize = RegExpMacroAssembler::kTableSize;
3288 : static const int kMask = RegExpMacroAssembler::kTableMask;
3289 :
3290 51 : DebugOnly<int> base = (min_char & ~kMask);
3291 :
3292 : // Assert that everything is on one kTableSize page.
3293 0 : for (int i = start_index; i <= end_index; i++)
3294 316 : MOZ_ASSERT((ranges[i] & ~kMask) == base);
3295 17 : MOZ_ASSERT(start_index == 0 || (ranges[start_index - 1] & ~kMask) <= base);
3296 :
3297 : char templ[kSize];
3298 : jit::Label* on_bit_set;
3299 : jit::Label* on_bit_clear;
3300 : int bit;
3301 17 : if (even_label == fall_through) {
3302 : on_bit_set = odd_label;
3303 : on_bit_clear = even_label;
3304 : bit = 1;
3305 : } else {
3306 0 : on_bit_set = even_label;
3307 17 : on_bit_clear = odd_label;
3308 0 : bit = 0;
3309 : }
3310 0 : for (int i = 0; i < (ranges[start_index] & kMask) && i < kSize; i++)
3311 0 : templ[i] = bit;
3312 0 : int j = 0;
3313 0 : bit ^= 1;
3314 0 : for (int i = start_index; i < end_index; i++) {
3315 2405 : for (j = (ranges[i] & kMask); j < (ranges[i + 1] & kMask); j++) {
3316 0 : templ[j] = bit;
3317 : }
3318 0 : bit ^= 1;
3319 : }
3320 965 : for (int i = j; i < kSize; i++) {
3321 474 : templ[i] = bit;
3322 : }
3323 :
3324 : // TODO(erikcorry): Cache these.
3325 0 : RegExpShared::JitCodeTable ba;
3326 : {
3327 0 : AutoEnterOOMUnsafeRegion oomUnsafe;
3328 1 : ba.reset(static_cast<uint8_t*>(js_malloc(kSize)));
3329 17 : if (!ba)
3330 0 : oomUnsafe.crash("Table malloc");
3331 : }
3332 :
3333 2193 : for (int i = 0; i < kSize; i++)
3334 0 : ba[i] = templ[i];
3335 :
3336 0 : masm->CheckBitInTable(std::move(ba), on_bit_set);
3337 0 : if (on_bit_clear != fall_through)
3338 17 : masm->JumpOrBacktrack(on_bit_clear);
3339 17 : }
3340 :
3341 : static void
3342 95 : CutOutRange(RegExpMacroAssembler* masm,
3343 : RangeBoundaryVector& ranges,
3344 : int start_index,
3345 : int end_index,
3346 : int cut_index,
3347 : jit::Label* even_label,
3348 : jit::Label* odd_label)
3349 : {
3350 0 : bool odd = (((cut_index - start_index) & 1) == 1);
3351 0 : jit::Label* in_range_label = odd ? odd_label : even_label;
3352 0 : jit::Label dummy;
3353 0 : EmitDoubleBoundaryTest(masm,
3354 190 : ranges[cut_index],
3355 190 : ranges[cut_index + 1] - 1,
3356 : &dummy,
3357 : in_range_label,
3358 95 : &dummy);
3359 95 : MOZ_ASSERT(!dummy.used());
3360 : // Cut out the single range by rewriting the array. This creates a new
3361 : // range that is a merger of the two ranges on either side of the one we
3362 : // are cutting out. The oddity of the labels is preserved.
3363 0 : for (int j = cut_index; j > start_index; j--)
3364 0 : ranges[j] = ranges[j - 1];
3365 0 : for (int j = cut_index + 1; j < end_index; j++)
3366 621 : ranges[j] = ranges[j + 1];
3367 95 : }
3368 :
3369 : // Unicode case. Split the search space into kSize spaces that are handled
3370 : // with recursion.
3371 : static void
3372 27 : SplitSearchSpace(RangeBoundaryVector& ranges,
3373 : int start_index,
3374 : int end_index,
3375 : int* new_start_index,
3376 : int* new_end_index,
3377 : int* border)
3378 : {
3379 : static const int kSize = RegExpMacroAssembler::kTableSize;
3380 : static const int kMask = RegExpMacroAssembler::kTableMask;
3381 :
3382 54 : int first = ranges[start_index];
3383 0 : int last = ranges[end_index] - 1;
3384 :
3385 0 : *new_start_index = start_index;
3386 0 : *border = (ranges[start_index] & ~kMask) + kSize;
3387 443 : while (*new_start_index < end_index) {
3388 0 : if (ranges[*new_start_index] > *border)
3389 : break;
3390 208 : (*new_start_index)++;
3391 : }
3392 : // new_start_index is the index of the first edge that is beyond the
3393 : // current kSize space.
3394 :
3395 : // For very large search spaces we do a binary chop search of the non-ASCII
3396 : // space instead of just going to the end of the current kSize space. The
3397 : // heuristics are complicated a little by the fact that any 128-character
3398 : // encoding space can be quickly tested with a table lookup, so we don't
3399 : // wish to do binary chop search at a smaller granularity than that. A
3400 : // 128-character space can take up a lot of space in the ranges array if,
3401 : // for example, we only want to match every second character (eg. the lower
3402 : // case characters on some Unicode pages).
3403 27 : int binary_chop_index = (end_index + start_index) / 2;
3404 : // The first test ensures that we get to the code that handles the ASCII
3405 : // range with a single not-taken branch, speeding up this important
3406 : // character range (even non-ASCII charset-based text has spaces and
3407 : // punctuation).
3408 0 : if (*border - 1 > kMaxOneByteCharCode && // ASCII case.
3409 0 : end_index - start_index > (*new_start_index - start_index) * 2 &&
3410 0 : last - first > kSize * 2 &&
3411 29 : binary_chop_index > *new_start_index &&
3412 0 : ranges[binary_chop_index] >= first + 2 * kSize)
3413 : {
3414 2 : int scan_forward_for_section_border = binary_chop_index;;
3415 0 : int new_border = (ranges[binary_chop_index] | kMask) + 1;
3416 :
3417 0 : while (scan_forward_for_section_border < end_index) {
3418 0 : if (ranges[scan_forward_for_section_border] > new_border) {
3419 0 : *new_start_index = scan_forward_for_section_border;
3420 1 : *border = new_border;
3421 0 : break;
3422 : }
3423 7 : scan_forward_for_section_border++;
3424 : }
3425 : }
3426 :
3427 0 : MOZ_ASSERT(*new_start_index > start_index);
3428 0 : *new_end_index = *new_start_index - 1;
3429 0 : if (ranges[*new_end_index] == *border)
3430 0 : (*new_end_index)--;
3431 0 : if (*border >= ranges[end_index]) {
3432 0 : *border = ranges[end_index];
3433 22 : *new_start_index = end_index; // Won't be used.
3434 0 : *new_end_index = end_index - 1;
3435 : }
3436 27 : }
3437 :
3438 : // Gets a series of segment boundaries representing a character class. If the
3439 : // character is in the range between an even and an odd boundary (counting from
3440 : // start_index) then go to even_label, otherwise go to odd_label. We already
3441 : // know that the character is in the range of min_char to max_char inclusive.
3442 : // Either label can be nullptr indicating backtracking. Either label can also be
3443 : // equal to the fall_through label.
3444 : static void
3445 232 : GenerateBranches(RegExpMacroAssembler* masm,
3446 : RangeBoundaryVector& ranges,
3447 : int start_index,
3448 : int end_index,
3449 : char16_t min_char,
3450 : char16_t max_char,
3451 : jit::Label* fall_through,
3452 : jit::Label* even_label,
3453 : jit::Label* odd_label)
3454 : {
3455 464 : int first = ranges[start_index];
3456 0 : int last = ranges[end_index] - 1;
3457 :
3458 232 : MOZ_ASSERT(min_char < first);
3459 :
3460 : // Just need to test if the character is before or on-or-after
3461 : // a particular character.
3462 0 : if (start_index == end_index) {
3463 8 : EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
3464 8 : return;
3465 : }
3466 :
3467 : // Another almost trivial case: There is one interval in the middle that is
3468 : // different from the end intervals.
3469 0 : if (start_index + 1 == end_index) {
3470 83 : EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label, odd_label);
3471 83 : return;
3472 : }
3473 :
3474 : // It's not worth using table lookup if there are very few intervals in the
3475 : // character class.
3476 141 : if (end_index - start_index <= 6) {
3477 : // It is faster to test for individual characters, so we look for those
3478 : // first, then try arbitrary ranges in the second round.
3479 : static int kNoCutIndex = -1;
3480 0 : int cut = kNoCutIndex;
3481 95 : for (int i = start_index; i < end_index; i++) {
3482 804 : if (ranges[i] == ranges[i + 1] - 1) {
3483 : cut = i;
3484 : break;
3485 : }
3486 : }
3487 0 : if (cut == kNoCutIndex) cut = start_index;
3488 0 : CutOutRange(masm, ranges, start_index, end_index, cut, even_label, odd_label);
3489 95 : MOZ_ASSERT(end_index - start_index >= 2);
3490 95 : GenerateBranches(masm,
3491 : ranges,
3492 : start_index + 1,
3493 : end_index - 1,
3494 : min_char,
3495 : max_char,
3496 : fall_through,
3497 : even_label,
3498 95 : odd_label);
3499 95 : return;
3500 : }
3501 :
3502 : // If there are a lot of intervals in the regexp, then we will use tables to
3503 : // determine whether the character is inside or outside the character class.
3504 : static const int kBits = RegExpMacroAssembler::kTableSizeBits;
3505 :
3506 46 : if ((max_char >> kBits) == (min_char >> kBits)) {
3507 : EmitUseLookupTable(masm,
3508 : ranges,
3509 : start_index,
3510 : end_index,
3511 : min_char,
3512 : fall_through,
3513 : even_label,
3514 17 : odd_label);
3515 17 : return;
3516 : }
3517 :
3518 29 : if ((min_char >> kBits) != (first >> kBits)) {
3519 2 : masm->CheckCharacterLT(first, odd_label);
3520 : GenerateBranches(masm,
3521 : ranges,
3522 : start_index + 1,
3523 : end_index,
3524 : first,
3525 : max_char,
3526 : fall_through,
3527 : odd_label,
3528 2 : even_label);
3529 2 : return;
3530 : }
3531 :
3532 0 : int new_start_index = 0;
3533 27 : int new_end_index = 0;
3534 27 : int border = 0;
3535 :
3536 : SplitSearchSpace(ranges,
3537 : start_index,
3538 : end_index,
3539 : &new_start_index,
3540 : &new_end_index,
3541 0 : &border);
3542 :
3543 0 : jit::Label handle_rest;
3544 27 : jit::Label* above = &handle_rest;
3545 27 : if (border == last + 1) {
3546 : // We didn't find any section that started after the limit, so everything
3547 : // above the border is one of the terminal labels.
3548 22 : above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
3549 22 : MOZ_ASSERT(new_end_index == end_index - 1);
3550 : }
3551 :
3552 0 : MOZ_ASSERT(start_index <= new_end_index);
3553 0 : MOZ_ASSERT(new_start_index <= end_index);
3554 0 : MOZ_ASSERT(start_index < new_start_index);
3555 27 : MOZ_ASSERT(new_end_index < end_index);
3556 27 : MOZ_ASSERT(new_end_index + 1 == new_start_index ||
3557 : (new_end_index + 2 == new_start_index &&
3558 : border == ranges[new_end_index + 1]));
3559 0 : MOZ_ASSERT(min_char < border - 1);
3560 0 : MOZ_ASSERT(border < max_char);
3561 54 : MOZ_ASSERT(ranges[new_end_index] < border);
3562 76 : MOZ_ASSERT(border < ranges[new_start_index] ||
3563 : (border == ranges[new_start_index] &&
3564 : new_start_index == end_index &&
3565 : new_end_index == end_index - 1 &&
3566 : border == last + 1));
3567 0 : MOZ_ASSERT(new_start_index == 0 || border >= ranges[new_start_index - 1]);
3568 :
3569 0 : masm->CheckCharacterGT(border - 1, above);
3570 54 : jit::Label dummy;
3571 27 : GenerateBranches(masm,
3572 : ranges,
3573 : start_index,
3574 : new_end_index,
3575 : min_char,
3576 27 : border - 1,
3577 : &dummy,
3578 : even_label,
3579 0 : odd_label);
3580 0 : if (handle_rest.used()) {
3581 0 : masm->Bind(&handle_rest);
3582 5 : bool flip = (new_start_index & 1) != (start_index & 1);
3583 5 : GenerateBranches(masm,
3584 : ranges,
3585 : new_start_index,
3586 : end_index,
3587 : border,
3588 : max_char,
3589 : &dummy,
3590 : flip ? odd_label : even_label,
3591 5 : flip ? even_label : odd_label);
3592 : }
3593 : }
3594 :
3595 : static void
3596 248 : EmitCharClass(LifoAlloc* alloc,
3597 : RegExpMacroAssembler* macro_assembler,
3598 : RegExpCharacterClass* cc,
3599 : bool latin1,
3600 : jit::Label* on_failure,
3601 : int cp_offset,
3602 : bool check_offset,
3603 : bool preloaded)
3604 : {
3605 0 : CharacterRangeVector& ranges = cc->ranges(alloc);
3606 248 : if (!CharacterRange::IsCanonical(ranges)) {
3607 28 : CharacterRange::Canonicalize(ranges);
3608 : }
3609 :
3610 248 : int max_char = MaximumCharacter(latin1);
3611 0 : int range_count = ranges.length();
3612 :
3613 0 : int last_valid_range = range_count - 1;
3614 0 : while (last_valid_range >= 0) {
3615 762 : CharacterRange& range = ranges[last_valid_range];
3616 381 : if (range.from() <= max_char) {
3617 : break;
3618 : }
3619 133 : last_valid_range--;
3620 : }
3621 :
3622 1 : if (last_valid_range < 0) {
3623 0 : if (!cc->is_negated()) {
3624 0 : macro_assembler->JumpOrBacktrack(on_failure);
3625 : }
3626 0 : if (check_offset) {
3627 0 : macro_assembler->CheckPosition(cp_offset, on_failure);
3628 : }
3629 145 : return;
3630 : }
3631 :
3632 0 : if (last_valid_range == 0 &&
3633 0 : ranges[0].IsEverything(max_char)) {
3634 77 : if (cc->is_negated()) {
3635 0 : macro_assembler->JumpOrBacktrack(on_failure);
3636 : } else {
3637 : // This is a common case hit by non-anchored expressions.
3638 77 : if (check_offset) {
3639 5 : macro_assembler->CheckPosition(cp_offset, on_failure);
3640 : }
3641 : }
3642 : return;
3643 : }
3644 0 : if (last_valid_range == 0 &&
3645 191 : !cc->is_negated() &&
3646 1 : ranges[0].IsEverything(max_char)) {
3647 : // This is a common case hit by non-anchored expressions.
3648 0 : if (check_offset) {
3649 0 : macro_assembler->CheckPosition(cp_offset, on_failure);
3650 : }
3651 : return;
3652 : }
3653 :
3654 171 : if (!preloaded) {
3655 159 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
3656 : }
3657 :
3658 0 : if (cc->is_standard(alloc) &&
3659 71 : macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
3660 71 : on_failure)) {
3661 : return;
3662 : }
3663 :
3664 : // A new list with ascending entries. Each entry is a code unit
3665 : // where there is a boundary between code units that are part of
3666 : // the class and code units that are not. Normally we insert an
3667 : // entry at zero which goes to the failure label, but if there
3668 : // was already one there we fall through for success on that entry.
3669 : // Subsequent entries have alternating meaning (success/failure).
3670 : RangeBoundaryVector* range_boundaries =
3671 0 : alloc->newInfallible<RangeBoundaryVector>(*alloc);
3672 :
3673 0 : bool zeroth_entry_is_failure = !cc->is_negated();
3674 :
3675 0 : range_boundaries->reserve(last_valid_range);
3676 0 : for (int i = 0; i <= last_valid_range; i++) {
3677 0 : CharacterRange& range = ranges[i];
3678 0 : if (range.from() == 0) {
3679 3 : MOZ_ASSERT(i == 0);
3680 0 : zeroth_entry_is_failure = !zeroth_entry_is_failure;
3681 : } else {
3682 0 : range_boundaries->append(range.from());
3683 : }
3684 0 : range_boundaries->append(range.to() + 1);
3685 : }
3686 0 : int end_index = range_boundaries->length() - 1;
3687 206 : if ((*range_boundaries)[end_index] > max_char)
3688 0 : end_index--;
3689 :
3690 206 : jit::Label fall_through;
3691 103 : GenerateBranches(macro_assembler,
3692 : *range_boundaries,
3693 : 0, // start_index.
3694 : end_index,
3695 : 0, // min_char.
3696 : max_char,
3697 : &fall_through,
3698 : zeroth_entry_is_failure ? &fall_through : on_failure,
3699 103 : zeroth_entry_is_failure ? on_failure : &fall_through);
3700 103 : macro_assembler->Bind(&fall_through);
3701 : }
3702 :
3703 : typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
3704 : char16_t c,
3705 : jit::Label* on_failure,
3706 : int cp_offset,
3707 : bool check,
3708 : bool preloaded);
3709 :
3710 : static inline bool
3711 20551 : EmitSimpleCharacter(RegExpCompiler* compiler,
3712 : char16_t c,
3713 : jit::Label* on_failure,
3714 : int cp_offset,
3715 : bool check,
3716 : bool preloaded)
3717 : {
3718 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3719 0 : bool bound_checked = false;
3720 0 : if (!preloaded) {
3721 20551 : assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
3722 0 : bound_checked = true;
3723 : }
3724 20551 : assembler->CheckNotCharacter(c, on_failure);
3725 20551 : return bound_checked;
3726 : }
3727 :
3728 : // Emit character for case independent match, when GetCaseIndependentLetters
3729 : // returns single character.
3730 : // This is used by the following 2 cases:
3731 : // * non-letters (things that don't have case)
3732 : // * letters that map across Latin1 and non-Latin1, and non-Latin1 case is
3733 : // filtered out because of Latin1 match
3734 : static inline bool
3735 9 : EmitAtomSingle(RegExpCompiler* compiler,
3736 : char16_t c,
3737 : jit::Label* on_failure,
3738 : int cp_offset,
3739 : bool check,
3740 : bool preloaded)
3741 : {
3742 9 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3743 0 : bool latin1 = compiler->latin1();
3744 : char16_t chars[kEcma262UnCanonicalizeMaxWidth];
3745 9 : int length = GetCaseIndependentLetters(c, latin1, compiler->unicode(), chars);
3746 9 : if (length != 1)
3747 : return false;
3748 :
3749 0 : bool checked = false;
3750 0 : if (!preloaded) {
3751 4 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
3752 0 : checked = check;
3753 : }
3754 4 : macro_assembler->CheckNotCharacter(chars[0], on_failure);
3755 4 : return checked;
3756 : }
3757 :
3758 : static bool
3759 5 : ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
3760 : bool latin1,
3761 : char16_t c1,
3762 : char16_t c2,
3763 : jit::Label* on_failure)
3764 : {
3765 0 : char16_t char_mask = MaximumCharacter(latin1);
3766 :
3767 0 : MOZ_ASSERT(c1 != c2);
3768 0 : if (c1 > c2) {
3769 0 : char16_t tmp = c1;
3770 5 : c1 = c2;
3771 5 : c2 = tmp;
3772 : }
3773 :
3774 0 : char16_t exor = c1 ^ c2;
3775 : // Check whether exor has only one bit set.
3776 0 : if (((exor - 1) & exor) == 0) {
3777 : // If c1 and c2 differ only by one bit.
3778 0 : char16_t mask = char_mask ^ exor;
3779 5 : macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
3780 5 : return true;
3781 : }
3782 :
3783 0 : char16_t diff = c2 - c1;
3784 0 : if (((diff - 1) & diff) == 0 && c1 >= diff) {
3785 : // If the characters differ by 2^n but don't differ by one bit then
3786 : // subtract the difference from the found character, then do the or
3787 : // trick. We avoid the theoretical case where negative numbers are
3788 : // involved in order to simplify code generation.
3789 0 : char16_t mask = char_mask ^ diff;
3790 0 : macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
3791 : diff,
3792 : mask,
3793 0 : on_failure);
3794 0 : return true;
3795 : }
3796 : return false;
3797 : }
3798 :
3799 : // Emit character for case independent match, when GetCaseIndependentLetters
3800 : // returns multiple characters.
3801 : static inline bool
3802 9 : EmitAtomMulti(RegExpCompiler* compiler,
3803 : char16_t c,
3804 : jit::Label* on_failure,
3805 : int cp_offset,
3806 : bool check,
3807 : bool preloaded)
3808 : {
3809 9 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3810 0 : bool latin1 = compiler->latin1();
3811 : char16_t chars[kEcma262UnCanonicalizeMaxWidth];
3812 9 : int length = GetCaseIndependentLetters(c, latin1, compiler->unicode(), chars);
3813 9 : if (length <= 1) return false;
3814 : // We may not need to check against the end of the input string
3815 : // if this character lies before a character that matched.
3816 0 : if (!preloaded)
3817 5 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
3818 0 : jit::Label ok;
3819 : MOZ_ASSERT(kEcma262UnCanonicalizeMaxWidth == 4);
3820 0 : switch (length) {
3821 : case 2: {
3822 0 : if (ShortCutEmitCharacterPair(macro_assembler,
3823 : latin1,
3824 5 : chars[0],
3825 5 : chars[1],
3826 : on_failure)) {
3827 : } else {
3828 0 : macro_assembler->CheckCharacter(chars[0], &ok);
3829 0 : macro_assembler->CheckNotCharacter(chars[1], on_failure);
3830 0 : macro_assembler->Bind(&ok);
3831 : }
3832 : break;
3833 : }
3834 : case 4:
3835 0 : macro_assembler->CheckCharacter(chars[3], &ok);
3836 : MOZ_FALLTHROUGH;
3837 : case 3:
3838 0 : macro_assembler->CheckCharacter(chars[0], &ok);
3839 0 : macro_assembler->CheckCharacter(chars[1], &ok);
3840 0 : macro_assembler->CheckNotCharacter(chars[2], on_failure);
3841 0 : macro_assembler->Bind(&ok);
3842 0 : break;
3843 : default:
3844 0 : MOZ_CRASH("Bad length");
3845 : }
3846 : return true;
3847 : }
3848 :
3849 : // We call this repeatedly to generate code for each pass over the text node.
3850 : // The passes are in increasing order of difficulty because we hope one
3851 : // of the first passes will fail in which case we are saved the work of the
3852 : // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3853 : // we will check the '%' in the first pass, the case independent 'a' in the
3854 : // second pass and the character class in the last pass.
3855 : //
3856 : // The passes are done from right to left, so for example to test for /bar/
3857 : // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3858 : // and then a 'b' with offset 0. This means we can avoid the end-of-input
3859 : // bounds check most of the time. In the example we only need to check for
3860 : // end-of-input when loading the putative 'r'.
3861 : //
3862 : // A slight complication involves the fact that the first character may already
3863 : // be fetched into a register by the previous node. In this case we want to
3864 : // do the test for that character first. We do this in separate passes. The
3865 : // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3866 : // pass has been performed then subsequent passes will have true in
3867 : // first_element_checked to indicate that that character does not need to be
3868 : // checked again.
3869 : //
3870 : // In addition to all this we are passed a Trace, which can
3871 : // contain an AlternativeGeneration object. In this AlternativeGeneration
3872 : // object we can see details of any quick check that was already passed in
3873 : // order to get to the code we are now generating. The quick check can involve
3874 : // loading characters, which means we do not need to recheck the bounds
3875 : // up to the limit the quick check already checked. In addition the quick
3876 : // check can have involved a mask and compare operation which may simplify
3877 : // or obviate the need for further checks at some character positions.
3878 : void
3879 4178 : TextNode::TextEmitPass(RegExpCompiler* compiler,
3880 : TextEmitPassType pass,
3881 : bool preloaded,
3882 : Trace* trace,
3883 : bool first_element_checked,
3884 : int* checked_up_to)
3885 : {
3886 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3887 0 : bool latin1 = compiler->latin1();
3888 0 : jit::Label* backtrack = trace->backtrack();
3889 0 : QuickCheckDetails* quick_check = trace->quick_check_performed();
3890 0 : int element_count = elements().length();
3891 0 : for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3892 0 : TextElement elm = elements()[i];
3893 0 : int cp_offset = trace->cp_offset() + elm.cp_offset();
3894 0 : if (elm.text_type() == TextElement::ATOM) {
3895 0 : const CharacterVector& quarks = elm.atom()->data();
3896 0 : for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3897 0 : if (first_element_checked && i == 0 && j == 0) continue;
3898 0 : if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3899 61679 : EmitCharacterFunction* emit_function = nullptr;
3900 0 : switch (pass) {
3901 : case NON_LATIN1_MATCH:
3902 0 : MOZ_ASSERT(latin1);
3903 0 : if (!IsLatin1Equivalent(quarks[j], compiler)) {
3904 0 : assembler->JumpOrBacktrack(backtrack);
3905 0 : return;
3906 : }
3907 : break;
3908 : case CASE_SINGLE_CHARACTER_MATCH:
3909 9 : emit_function = &EmitAtomSingle;
3910 0 : break;
3911 : case SIMPLE_CHARACTER_MATCH:
3912 20551 : emit_function = &EmitSimpleCharacter;
3913 0 : break;
3914 : case CASE_MUTLI_CHARACTER_MATCH:
3915 9 : emit_function = &EmitAtomMulti;
3916 9 : break;
3917 : default:
3918 : break;
3919 : }
3920 61679 : if (emit_function != nullptr) {
3921 : // emit_function is a function pointer. Suppress static
3922 : // analysis false positives.
3923 0 : JS::AutoSuppressGCAnalysis suppress;
3924 61707 : bool bound_checked = emit_function(compiler,
3925 41138 : quarks[j],
3926 : backtrack,
3927 : cp_offset + j,
3928 0 : *checked_up_to < cp_offset + j,
3929 20569 : preloaded);
3930 20569 : if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3931 : }
3932 : }
3933 : } else {
3934 0 : MOZ_ASSERT(TextElement::CHAR_CLASS == elm.text_type());
3935 0 : if (pass == CHARACTER_CLASS_MATCH) {
3936 0 : if (first_element_checked && i == 0) continue;
3937 0 : if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3938 248 : RegExpCharacterClass* cc = elm.char_class();
3939 496 : EmitCharClass(alloc(),
3940 : assembler,
3941 : cc,
3942 : latin1,
3943 : backtrack,
3944 : cp_offset,
3945 248 : *checked_up_to < cp_offset,
3946 248 : preloaded);
3947 : UpdateBoundsCheck(cp_offset, checked_up_to);
3948 : }
3949 : }
3950 : }
3951 : }
3952 :
3953 : int
3954 0 : TextNode::Length()
3955 : {
3956 0 : TextElement elm = elements()[elements().length() - 1];
3957 4473 : MOZ_ASSERT(elm.cp_offset() >= 0);
3958 4473 : return elm.cp_offset() + elm.length();
3959 : }
3960 :
3961 : bool
3962 0 : TextNode::SkipPass(int int_pass, bool ignore_case)
3963 : {
3964 0 : TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3965 0 : if (ignore_case)
3966 252 : return pass == SIMPLE_CHARACTER_MATCH;
3967 5340 : return pass == CASE_SINGLE_CHARACTER_MATCH || pass == CASE_MUTLI_CHARACTER_MATCH;
3968 : }
3969 :
3970 : // This generates the code to match a text node. A text node can contain
3971 : // straight character sequences (possibly to be matched in a case-independent
3972 : // way) and character classes. For efficiency we do not do this in a single
3973 : // pass from left to right. Instead we pass over the text node several times,
3974 : // emitting code for some character positions every time. See the comment on
3975 : // TextEmitPass for details.
3976 : void
3977 0 : TextNode::Emit(RegExpCompiler* compiler, Trace* trace)
3978 : {
3979 0 : LimitResult limit_result = LimitVersions(compiler, trace);
3980 1392 : if (limit_result == DONE) return;
3981 0 : MOZ_ASSERT(limit_result == CONTINUE);
3982 :
3983 1364 : if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3984 0 : compiler->SetRegExpTooBig();
3985 : return;
3986 : }
3987 :
3988 0 : if (compiler->latin1()) {
3989 1319 : int dummy = 0;
3990 1319 : TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3991 : }
3992 :
3993 0 : bool first_elt_done = false;
3994 1364 : int bound_checked_to = trace->cp_offset() - 1;
3995 1364 : bound_checked_to += trace->bound_checked_up_to();
3996 :
3997 : // If a character is preloaded into the current character register then
3998 : // check that now.
3999 0 : if (trace->characters_preloaded() == 1) {
4000 306 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
4001 272 : if (!SkipPass(pass, compiler->ignore_case())) {
4002 : TextEmitPass(compiler,
4003 : static_cast<TextEmitPassType>(pass),
4004 : true,
4005 : trace,
4006 : false,
4007 74 : &bound_checked_to);
4008 : }
4009 : }
4010 : first_elt_done = true;
4011 : }
4012 :
4013 0 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
4014 10912 : if (!SkipPass(pass, compiler->ignore_case())) {
4015 2785 : TextEmitPass(compiler,
4016 : static_cast<TextEmitPassType>(pass),
4017 : false,
4018 : trace,
4019 : first_elt_done,
4020 2785 : &bound_checked_to);
4021 : }
4022 : }
4023 :
4024 0 : Trace successor_trace(*trace);
4025 0 : successor_trace.set_at_start(false);
4026 0 : successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
4027 2728 : RecursionCheck rc(compiler);
4028 1364 : on_success()->Emit(compiler, &successor_trace);
4029 : }
4030 :
4031 : void
4032 0 : LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace)
4033 : {
4034 647 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4035 0 : if (trace->stop_node() == this) {
4036 : int text_length =
4037 204 : GreedyLoopTextLengthForAlternative(&alternatives()[0]);
4038 68 : MOZ_ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
4039 : // Update the counter-based backtracking info on the stack. This is an
4040 : // optimization for greedy loops (see below).
4041 0 : MOZ_ASSERT(trace->cp_offset() == text_length);
4042 0 : macro_assembler->AdvanceCurrentPosition(text_length);
4043 68 : macro_assembler->JumpOrBacktrack(trace->loop_label());
4044 0 : return;
4045 : }
4046 0 : MOZ_ASSERT(trace->stop_node() == nullptr);
4047 0 : if (!trace->is_trivial()) {
4048 270 : trace->Flush(compiler, this);
4049 0 : return;
4050 : }
4051 309 : ChoiceNode::Emit(compiler, trace);
4052 : }
4053 :
4054 : /* Code generation for choice nodes.
4055 : *
4056 : * We generate quick checks that do a mask and compare to eliminate a
4057 : * choice. If the quick check succeeds then it jumps to the continuation to
4058 : * do slow checks and check subsequent nodes. If it fails (the common case)
4059 : * it falls through to the next choice.
4060 : *
4061 : * Here is the desired flow graph. Nodes directly below each other imply
4062 : * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
4063 : * 3 doesn't have a quick check so we have to call the slow check.
4064 : * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
4065 : * regexp continuation is generated directly after the Sn node, up to the
4066 : * next JumpOrBacktrack if we decide to reuse some already generated code. Some
4067 : * nodes expect preload_characters to be preloaded into the current
4068 : * character register. R nodes do this preloading. Vertices are marked
4069 : * F for failures and S for success (possible success in the case of quick
4070 : * nodes). L, V, < and > are used as arrow heads.
4071 : *
4072 : * ----------> R
4073 : * |
4074 : * V
4075 : * Q1 -----> S1
4076 : * | S /
4077 : * F| /
4078 : * | F/
4079 : * | /
4080 : * | R
4081 : * | /
4082 : * V L
4083 : * Q2 -----> S2
4084 : * | S /
4085 : * F| /
4086 : * | F/
4087 : * | /
4088 : * | R
4089 : * | /
4090 : * V L
4091 : * S3
4092 : * |
4093 : * F|
4094 : * |
4095 : * R
4096 : * |
4097 : * backtrack V
4098 : * <----------Q4
4099 : * \ F |
4100 : * \ |S
4101 : * \ F V
4102 : * \-----S4
4103 : *
4104 : * For greedy loops we reverse our expectation and expect to match rather
4105 : * than fail. Therefore we want the loop code to look like this (U is the
4106 : * unwind code that steps back in the greedy loop). The following alternatives
4107 : * look the same as above.
4108 : * _____
4109 : * / \
4110 : * V |
4111 : * ----------> S1 |
4112 : * /| |
4113 : * / |S |
4114 : * F/ \_____/
4115 : * /
4116 : * |<-----------
4117 : * | \
4118 : * V \
4119 : * Q2 ---> S2 \
4120 : * | S / |
4121 : * F| / |
4122 : * | F/ |
4123 : * | / |
4124 : * | R |
4125 : * | / |
4126 : * F VL |
4127 : * <------U |
4128 : * back |S |
4129 : * \______________/
4130 : */
4131 :
4132 : // This class is used when generating the alternatives in a choice node. It
4133 : // records the way the alternative is being code generated.
4134 3983 : class irregexp::AlternativeGeneration
4135 : {
4136 : public:
4137 : AlternativeGeneration()
4138 3983 : : possible_success(),
4139 : expects_preload(false),
4140 : after(),
4141 15932 : quick_check_details()
4142 : {}
4143 :
4144 : jit::Label possible_success;
4145 : bool expects_preload;
4146 : jit::Label after;
4147 : QuickCheckDetails quick_check_details;
4148 : };
4149 :
4150 : void
4151 47 : ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
4152 : Guard* guard, Trace* trace)
4153 : {
4154 0 : switch (guard->op()) {
4155 : case Guard::LT:
4156 60 : MOZ_ASSERT(!trace->mentions_reg(guard->reg()));
4157 0 : macro_assembler->IfRegisterGE(guard->reg(),
4158 : guard->value(),
4159 60 : trace->backtrack());
4160 0 : break;
4161 : case Guard::GEQ:
4162 34 : MOZ_ASSERT(!trace->mentions_reg(guard->reg()));
4163 0 : macro_assembler->IfRegisterLT(guard->reg(),
4164 : guard->value(),
4165 34 : trace->backtrack());
4166 0 : break;
4167 : }
4168 47 : }
4169 :
4170 : int
4171 0 : ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least)
4172 : {
4173 0 : int preload_characters = Min(4, eats_at_least);
4174 0 : if (compiler->macro_assembler()->CanReadUnaligned()) {
4175 0 : bool latin1 = compiler->latin1();
4176 1 : if (latin1) {
4177 354 : if (preload_characters > 4)
4178 0 : preload_characters = 4;
4179 : // We can't preload 3 characters because there is no machine instruction
4180 : // to do that. We can't just load 4 because we could be reading
4181 : // beyond the end of the string, which could cause a memory fault.
4182 354 : if (preload_characters == 3)
4183 0 : preload_characters = 2;
4184 : } else {
4185 24 : if (preload_characters > 2)
4186 4 : preload_characters = 2;
4187 : }
4188 : } else {
4189 0 : if (preload_characters > 1)
4190 0 : preload_characters = 1;
4191 : }
4192 378 : return preload_characters;
4193 : }
4194 :
4195 : RegExpNode*
4196 0 : TextNode::GetSuccessorOfOmnivorousTextNode(RegExpCompiler* compiler)
4197 : {
4198 254 : if (elements().length() != 1)
4199 : return nullptr;
4200 :
4201 252 : TextElement elm = elements()[0];
4202 126 : if (elm.text_type() != TextElement::CHAR_CLASS)
4203 : return nullptr;
4204 :
4205 82 : RegExpCharacterClass* node = elm.char_class();
4206 0 : CharacterRangeVector& ranges = node->ranges(alloc());
4207 :
4208 82 : if (!CharacterRange::IsCanonical(ranges))
4209 0 : CharacterRange::Canonicalize(ranges);
4210 :
4211 82 : if (node->is_negated())
4212 0 : return ranges.length() == 0 ? on_success() : nullptr;
4213 :
4214 81 : if (ranges.length() != 1)
4215 : return nullptr;
4216 :
4217 154 : uint32_t max_char = MaximumCharacter(compiler->latin1());
4218 231 : return ranges[0].IsEverything(max_char) ? on_success() : nullptr;
4219 : }
4220 :
4221 : // Finds the fixed match length of a sequence of nodes that goes from
4222 : // this alternative and back to this choice node. If there are variable
4223 : // length nodes or other complications in the way then return a sentinel
4224 : // value indicating that a greedy loop cannot be constructed.
4225 : int
4226 0 : ChoiceNode::GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative)
4227 : {
4228 446 : int length = 0;
4229 446 : RegExpNode* node = alternative->node();
4230 : // Later we will generate code for all these text nodes using recursion
4231 : // so we have to limit the max number.
4232 0 : int recursion_depth = 0;
4233 743 : while (node != this) {
4234 607 : if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
4235 : return kNodeIsTooComplexForGreedyLoops;
4236 : }
4237 607 : int node_length = node->GreedyLoopTextLength();
4238 607 : if (node_length == kNodeIsTooComplexForGreedyLoops) {
4239 : return kNodeIsTooComplexForGreedyLoops;
4240 : }
4241 0 : length += node_length;
4242 297 : SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
4243 297 : node = seq_node->on_success();
4244 : }
4245 : return length;
4246 : }
4247 :
4248 : // Creates a list of AlternativeGenerations. If the list has a reasonable
4249 : // size then it is on the stack, otherwise the excess is on the heap.
4250 : class AlternativeGenerationList
4251 : {
4252 : public:
4253 378 : AlternativeGenerationList(LifoAlloc* alloc, size_t count)
4254 0 : : alt_gens_(*alloc)
4255 : {
4256 0 : alt_gens_.reserve(count);
4257 0 : for (size_t i = 0; i < count && i < kAFew; i++)
4258 0 : alt_gens_.append(a_few_alt_gens_ + i);
4259 0 : for (size_t i = kAFew; i < count; i++) {
4260 0 : AutoEnterOOMUnsafeRegion oomUnsafe;
4261 1 : AlternativeGeneration* gen = js_new<AlternativeGeneration>();
4262 0 : if (!gen)
4263 0 : oomUnsafe.crash("AlternativeGenerationList js_new");
4264 0 : alt_gens_.append(gen);
4265 : }
4266 0 : }
4267 :
4268 0 : ~AlternativeGenerationList() {
4269 0 : for (size_t i = kAFew; i < alt_gens_.length(); i++) {
4270 406 : js_delete(alt_gens_[i]);
4271 0 : alt_gens_[i] = nullptr;
4272 : }
4273 378 : }
4274 :
4275 : AlternativeGeneration* at(int i) {
4276 6324 : return alt_gens_[i];
4277 : }
4278 :
4279 : private:
4280 : static const size_t kAFew = 10;
4281 : InfallibleVector<AlternativeGeneration*, 1> alt_gens_;
4282 : AlternativeGeneration a_few_alt_gens_[kAFew];
4283 : };
4284 :
4285 : void
4286 0 : ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace)
4287 : {
4288 526 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4289 0 : size_t choice_count = alternatives().length();
4290 : #ifdef DEBUG
4291 0 : for (size_t i = 0; i < choice_count - 1; i++) {
4292 0 : const GuardedAlternative& alternative = alternatives()[i];
4293 0 : const GuardVector* guards = alternative.guards();
4294 0 : if (guards) {
4295 240 : for (size_t j = 0; j < guards->length(); j++)
4296 160 : MOZ_ASSERT(!trace->mentions_reg((*guards)[j]->reg()));
4297 : }
4298 : }
4299 : #endif
4300 :
4301 0 : LimitResult limit_result = LimitVersions(compiler, trace);
4302 674 : if (limit_result == DONE) return;
4303 0 : MOZ_ASSERT(limit_result == CONTINUE);
4304 :
4305 0 : int new_flush_budget = trace->flush_budget() / choice_count;
4306 0 : if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
4307 0 : trace->Flush(compiler, this);
4308 0 : return;
4309 : }
4310 :
4311 0 : RecursionCheck rc(compiler);
4312 :
4313 0 : Trace* current_trace = trace;
4314 :
4315 0 : int text_length = GreedyLoopTextLengthForAlternative(&alternatives()[0]);
4316 0 : bool greedy_loop = false;
4317 0 : jit::Label greedy_loop_label;
4318 0 : Trace counter_backtrack_trace;
4319 756 : counter_backtrack_trace.set_backtrack(&greedy_loop_label);
4320 0 : if (not_at_start()) counter_backtrack_trace.set_at_start(false);
4321 :
4322 378 : if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
4323 : // Here we have special handling for greedy loops containing only text nodes
4324 : // and other simple nodes. These are handled by pushing the current
4325 : // position on the stack and then incrementing the current position each
4326 : // time around the switch. On backtrack we decrement the current position
4327 : // and check it against the pushed value. This avoids pushing backtrack
4328 : // information for each iteration of the loop, which could take up a lot of
4329 : // space.
4330 0 : greedy_loop = true;
4331 0 : MOZ_ASSERT(trace->stop_node() == nullptr);
4332 0 : macro_assembler->PushCurrentPosition();
4333 0 : current_trace = &counter_backtrack_trace;
4334 0 : jit::Label greedy_match_failed;
4335 0 : Trace greedy_match_trace;
4336 0 : if (not_at_start()) greedy_match_trace.set_at_start(false);
4337 0 : greedy_match_trace.set_backtrack(&greedy_match_failed);
4338 0 : jit::Label loop_label;
4339 0 : macro_assembler->Bind(&loop_label);
4340 0 : greedy_match_trace.set_stop_node(this);
4341 0 : greedy_match_trace.set_loop_label(&loop_label);
4342 136 : alternatives()[0].node()->Emit(compiler, &greedy_match_trace);
4343 68 : macro_assembler->Bind(&greedy_match_failed);
4344 : }
4345 :
4346 756 : jit::Label second_choice; // For use in greedy matches.
4347 0 : macro_assembler->Bind(&second_choice);
4348 :
4349 0 : size_t first_normal_choice = greedy_loop ? 1 : 0;
4350 :
4351 0 : bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE;
4352 378 : const int kEatsAtLeastNotYetInitialized = -1;
4353 0 : int eats_at_least = kEatsAtLeastNotYetInitialized;
4354 :
4355 0 : bool skip_was_emitted = false;
4356 :
4357 0 : if (!greedy_loop && choice_count == 2) {
4358 0 : GuardedAlternative alt1 = alternatives()[1];
4359 0 : if (!alt1.guards() || alt1.guards()->length() == 0) {
4360 182 : RegExpNode* eats_anything_node = alt1.node();
4361 182 : if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == this) {
4362 : // At this point we know that we are at a non-greedy loop that will eat
4363 : // any character one at a time. Any non-anchored regexp has such a
4364 : // loop prepended to it in order to find where it starts. We look for
4365 : // a pattern of the form ...abc... where we can look 6 characters ahead
4366 : // and step forwards 3 if the character is not one of abc. Abc need
4367 : // not be atoms, they can be any reasonably limited character class or
4368 : // small alternation.
4369 0 : MOZ_ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
4370 0 : BoyerMooreLookahead* lookahead = bm_info(not_at_start);
4371 58 : if (lookahead == nullptr) {
4372 116 : eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4373 : EatsAtLeast(kMaxLookaheadForBoyerMoore,
4374 : kRecursionBudget,
4375 58 : not_at_start));
4376 0 : if (eats_at_least >= 1) {
4377 : BoyerMooreLookahead* bm =
4378 0 : alloc()->newInfallible<BoyerMooreLookahead>(alloc(), eats_at_least, compiler);
4379 0 : GuardedAlternative alt0 = alternatives()[0];
4380 58 : alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
4381 58 : skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
4382 : }
4383 : } else {
4384 0 : skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
4385 : }
4386 : }
4387 : }
4388 : }
4389 :
4390 0 : if (eats_at_least == kEatsAtLeastNotYetInitialized) {
4391 : // Save some time by looking at most one machine word ahead.
4392 320 : eats_at_least =
4393 0 : EatsAtLeast(compiler->latin1() ? 4 : 2, kRecursionBudget, not_at_start);
4394 : }
4395 0 : int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
4396 :
4397 0 : bool preload_is_current = !skip_was_emitted &&
4398 721 : (current_trace->characters_preloaded() == preload_characters);
4399 0 : bool preload_has_checked_bounds = preload_is_current;
4400 :
4401 756 : AlternativeGenerationList alt_gens(alloc(), choice_count);
4402 :
4403 : // For now we just call all choices one after the other. The idea ultimately
4404 : // is to use the Dispatch table to try only the relevant ones.
4405 0 : for (size_t i = first_normal_choice; i < choice_count; i++) {
4406 0 : GuardedAlternative alternative = alternatives()[i];
4407 0 : AlternativeGeneration* alt_gen = alt_gens.at(i);
4408 0 : alt_gen->quick_check_details.set_characters(preload_characters);
4409 0 : const GuardVector* guards = alternative.guards();
4410 1306 : Trace new_trace(*current_trace);
4411 0 : new_trace.set_characters_preloaded(preload_is_current ?
4412 : preload_characters :
4413 0 : 0);
4414 1306 : if (preload_has_checked_bounds) {
4415 0 : new_trace.set_bound_checked_up_to(preload_characters);
4416 : }
4417 0 : new_trace.quick_check_performed()->Clear();
4418 0 : if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4419 0 : alt_gen->expects_preload = preload_is_current;
4420 0 : bool generate_full_check_inline = false;
4421 2611 : if (try_to_emit_quick_check_for_alternative(i) &&
4422 1305 : alternative.node()->EmitQuickCheck(compiler,
4423 : &new_trace,
4424 : preload_has_checked_bounds,
4425 : &alt_gen->possible_success,
4426 : &alt_gen->quick_check_details,
4427 : i < choice_count - 1)) {
4428 : // Quick check was generated for this choice.
4429 1084 : preload_is_current = true;
4430 1084 : preload_has_checked_bounds = true;
4431 : // On the last choice in the ChoiceNode we generated the quick
4432 : // check to fall through on possible success. So now we need to
4433 : // generate the full check inline.
4434 0 : if (i == choice_count - 1) {
4435 0 : macro_assembler->Bind(&alt_gen->possible_success);
4436 0 : new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4437 0 : new_trace.set_characters_preloaded(preload_characters);
4438 448 : new_trace.set_bound_checked_up_to(preload_characters);
4439 0 : generate_full_check_inline = true;
4440 : }
4441 0 : } else if (alt_gen->quick_check_details.cannot_match()) {
4442 19 : if (i == choice_count - 1 && !greedy_loop) {
4443 0 : macro_assembler->JumpOrBacktrack(trace->backtrack());
4444 : }
4445 19 : continue;
4446 : } else {
4447 : // No quick check was generated. Put the full code here.
4448 : // If this is not the first choice then there could be slow checks from
4449 : // previous cases that go here when they fail. There's no reason to
4450 : // insist that they preload characters since the slow check we are about
4451 : // to generate probably can't use it.
4452 203 : if (i != first_normal_choice) {
4453 126 : alt_gen->expects_preload = false;
4454 : new_trace.InvalidateCurrentCharacter();
4455 : }
4456 203 : if (i < choice_count - 1) {
4457 49 : new_trace.set_backtrack(&alt_gen->after);
4458 : }
4459 : generate_full_check_inline = true;
4460 : }
4461 0 : if (generate_full_check_inline) {
4462 0 : if (new_trace.actions() != nullptr)
4463 0 : new_trace.set_flush_budget(new_flush_budget);
4464 0 : if (guards) {
4465 57 : for (size_t j = 0; j < guards->length(); j++)
4466 0 : GenerateGuard(macro_assembler, (*guards)[j], &new_trace);
4467 : }
4468 427 : alternative.node()->Emit(compiler, &new_trace);
4469 0 : preload_is_current = false;
4470 : }
4471 0 : macro_assembler->Bind(&alt_gen->after);
4472 : }
4473 378 : if (greedy_loop) {
4474 0 : macro_assembler->Bind(&greedy_loop_label);
4475 : // If we have unwound to the bottom then backtrack.
4476 0 : macro_assembler->CheckGreedyLoop(trace->backtrack());
4477 : // Otherwise try the second priority at an earlier position.
4478 68 : macro_assembler->AdvanceCurrentPosition(-text_length);
4479 68 : macro_assembler->JumpOrBacktrack(&second_choice);
4480 : }
4481 :
4482 : // At this point we need to generate slow checks for the alternatives where
4483 : // the quick check was inlined. We can recognize these because the associated
4484 : // label was bound.
4485 0 : for (size_t i = first_normal_choice; i < choice_count - 1; i++) {
4486 1856 : AlternativeGeneration* alt_gen = alt_gens.at(i);
4487 928 : Trace new_trace(*current_trace);
4488 : // If there are actions to be flushed we have to limit how many times
4489 : // they are flushed. Take the budget of the parent trace and distribute
4490 : // it fairly amongst the children.
4491 928 : if (new_trace.actions() != nullptr) {
4492 0 : new_trace.set_flush_budget(new_flush_budget);
4493 : }
4494 0 : EmitOutOfLineContinuation(compiler,
4495 : &new_trace,
4496 1856 : alternatives()[i],
4497 : alt_gen,
4498 : preload_characters,
4499 2784 : alt_gens.at(i + 1)->expects_preload);
4500 : }
4501 : }
4502 :
4503 : void
4504 928 : ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4505 : Trace* trace,
4506 : GuardedAlternative alternative,
4507 : AlternativeGeneration* alt_gen,
4508 : int preload_characters,
4509 : bool next_expects_preload)
4510 : {
4511 1856 : if (!alt_gen->possible_success.used())
4512 0 : return;
4513 :
4514 0 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4515 0 : macro_assembler->Bind(&alt_gen->possible_success);
4516 0 : Trace out_of_line_trace(*trace);
4517 0 : out_of_line_trace.set_characters_preloaded(preload_characters);
4518 0 : out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4519 0 : if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4520 0 : const GuardVector* guards = alternative.guards();
4521 0 : if (next_expects_preload) {
4522 0 : jit::Label reload_current_char;
4523 0 : out_of_line_trace.set_backtrack(&reload_current_char);
4524 0 : if (guards) {
4525 84 : for (size_t j = 0; j < guards->length(); j++)
4526 0 : GenerateGuard(macro_assembler, (*guards)[j], &out_of_line_trace);
4527 : }
4528 778 : alternative.node()->Emit(compiler, &out_of_line_trace);
4529 778 : macro_assembler->Bind(&reload_current_char);
4530 : // Reload the current character, since the next quick check expects that.
4531 : // We don't need to check bounds here because we only get into this
4532 : // code through a quick check which already did the checked load.
4533 778 : macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4534 : nullptr,
4535 : false,
4536 1556 : preload_characters);
4537 0 : macro_assembler->JumpOrBacktrack(&(alt_gen->after));
4538 : } else {
4539 0 : out_of_line_trace.set_backtrack(&(alt_gen->after));
4540 0 : if (guards) {
4541 0 : for (size_t j = 0; j < guards->length(); j++)
4542 0 : GenerateGuard(macro_assembler, (*guards)[j], &out_of_line_trace);
4543 : }
4544 82 : alternative.node()->Emit(compiler, &out_of_line_trace);
4545 : }
4546 : }
4547 :
4548 : void
4549 0 : ActionNode::Emit(RegExpCompiler* compiler, Trace* trace)
4550 : {
4551 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4552 0 : LimitResult limit_result = LimitVersions(compiler, trace);
4553 2735 : if (limit_result == DONE) return;
4554 0 : MOZ_ASSERT(limit_result == CONTINUE);
4555 :
4556 0 : RecursionCheck rc(compiler);
4557 :
4558 1997 : switch (action_type_) {
4559 : case STORE_POSITION: {
4560 : Trace::DeferredCapture
4561 : new_capture(data_.u_position_register.reg,
4562 0 : data_.u_position_register.is_capture,
4563 0 : trace);
4564 0 : Trace new_trace = *trace;
4565 1848 : new_trace.add_action(&new_capture);
4566 1848 : on_success()->Emit(compiler, &new_trace);
4567 : break;
4568 : }
4569 : case INCREMENT_REGISTER: {
4570 : Trace::DeferredIncrementRegister
4571 0 : new_increment(data_.u_increment_register.reg);
4572 0 : Trace new_trace = *trace;
4573 43 : new_trace.add_action(&new_increment);
4574 43 : on_success()->Emit(compiler, &new_trace);
4575 : break;
4576 : }
4577 : case SET_REGISTER: {
4578 : Trace::DeferredSetRegister
4579 0 : new_set(data_.u_store_register.reg, data_.u_store_register.value);
4580 0 : Trace new_trace = *trace;
4581 39 : new_trace.add_action(&new_set);
4582 39 : on_success()->Emit(compiler, &new_trace);
4583 : break;
4584 : }
4585 : case CLEAR_CAPTURES: {
4586 : Trace::DeferredClearCaptures
4587 : new_capture(Interval(data_.u_clear_captures.range_from,
4588 0 : data_.u_clear_captures.range_to));
4589 0 : Trace new_trace = *trace;
4590 15 : new_trace.add_action(&new_capture);
4591 15 : on_success()->Emit(compiler, &new_trace);
4592 : break;
4593 : }
4594 : case BEGIN_SUBMATCH:
4595 22 : if (!trace->is_trivial()) {
4596 0 : trace->Flush(compiler, this);
4597 : } else {
4598 0 : assembler->WriteCurrentPositionToRegister(data_.u_submatch.current_position_register, 0);
4599 11 : assembler->WriteBacktrackStackPointerToRegister(data_.u_submatch.stack_pointer_register);
4600 11 : on_success()->Emit(compiler, trace);
4601 : }
4602 : break;
4603 : case EMPTY_MATCH_CHECK: {
4604 0 : int start_pos_reg = data_.u_empty_match_check.start_register;
4605 0 : int stored_pos = 0;
4606 0 : int rep_reg = data_.u_empty_match_check.repetition_register;
4607 0 : bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4608 0 : bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4609 0 : if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4610 : // If we know we haven't advanced and there is no minimum we
4611 : // can just backtrack immediately.
4612 0 : assembler->JumpOrBacktrack(trace->backtrack());
4613 0 : } else if (know_dist && stored_pos < trace->cp_offset()) {
4614 : // If we know we've advanced we can generate the continuation
4615 : // immediately.
4616 0 : on_success()->Emit(compiler, trace);
4617 0 : } else if (!trace->is_trivial()) {
4618 0 : trace->Flush(compiler, this);
4619 : } else {
4620 0 : jit::Label skip_empty_check;
4621 : // If we have a minimum number of repetitions we check the current
4622 : // number first and skip the empty check if it's not enough.
4623 0 : if (has_minimum) {
4624 0 : int limit = data_.u_empty_match_check.repetition_limit;
4625 0 : assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4626 : }
4627 : // If the match is empty we bail out, otherwise we fall through
4628 : // to the on-success continuation.
4629 0 : assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4630 0 : trace->backtrack());
4631 0 : assembler->Bind(&skip_empty_check);
4632 0 : on_success()->Emit(compiler, trace);
4633 : }
4634 : break;
4635 : }
4636 : case POSITIVE_SUBMATCH_SUCCESS: {
4637 0 : if (!trace->is_trivial()) {
4638 20 : trace->Flush(compiler, this);
4639 0 : return;
4640 : }
4641 0 : assembler->ReadCurrentPositionFromRegister(data_.u_submatch.current_position_register);
4642 0 : assembler->ReadBacktrackStackPointerFromRegister(data_.u_submatch.stack_pointer_register);
4643 0 : int clear_register_count = data_.u_submatch.clear_register_count;
4644 0 : if (clear_register_count == 0) {
4645 10 : on_success()->Emit(compiler, trace);
4646 0 : return;
4647 : }
4648 0 : int clear_registers_from = data_.u_submatch.clear_register_from;
4649 0 : jit::Label clear_registers_backtrack;
4650 0 : Trace new_trace = *trace;
4651 0 : new_trace.set_backtrack(&clear_registers_backtrack);
4652 0 : on_success()->Emit(compiler, &new_trace);
4653 :
4654 0 : assembler->Bind(&clear_registers_backtrack);
4655 0 : int clear_registers_to = clear_registers_from + clear_register_count - 1;
4656 0 : assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4657 :
4658 0 : MOZ_ASSERT(trace->backtrack() == nullptr);
4659 0 : assembler->Backtrack();
4660 : return;
4661 : }
4662 : default:
4663 0 : MOZ_CRASH("Bad action");
4664 : }
4665 : }
4666 :
4667 : void
4668 0 : BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace)
4669 : {
4670 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4671 0 : if (!trace->is_trivial()) {
4672 0 : trace->Flush(compiler, this);
4673 0 : return;
4674 : }
4675 :
4676 0 : LimitResult limit_result = LimitVersions(compiler, trace);
4677 0 : if (limit_result == DONE) return;
4678 0 : MOZ_ASSERT(limit_result == CONTINUE);
4679 :
4680 0 : RecursionCheck rc(compiler);
4681 :
4682 0 : MOZ_ASSERT(start_reg_ + 1 == end_reg_);
4683 0 : if (compiler->ignore_case()) {
4684 0 : assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4685 : trace->backtrack(),
4686 0 : compiler->unicode());
4687 : } else {
4688 0 : assembler->CheckNotBackReference(start_reg_, trace->backtrack());
4689 : }
4690 0 : on_success()->Emit(compiler, trace);
4691 : }
4692 :
4693 : RegExpNode::LimitResult
4694 4639 : RegExpNode::LimitVersions(RegExpCompiler* compiler, Trace* trace)
4695 : {
4696 : // If we are generating a greedy loop then don't stop and don't reuse code.
4697 4639 : if (trace->stop_node() != nullptr)
4698 : return CONTINUE;
4699 :
4700 0 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4701 4571 : if (trace->is_trivial()) {
4702 1780 : if (label()->bound()) {
4703 : // We are being asked to generate a generic version, but that's already
4704 : // been done so just go to it.
4705 1012 : macro_assembler->JumpOrBacktrack(label());
4706 0 : return DONE;
4707 : }
4708 384 : if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
4709 : // To avoid too deep recursion we push the node to the work queue and just
4710 : // generate a goto here.
4711 0 : compiler->AddWork(this);
4712 0 : macro_assembler->JumpOrBacktrack(label());
4713 0 : return DONE;
4714 : }
4715 : // Generate generic version of the node and bind the label for later use.
4716 768 : macro_assembler->Bind(label());
4717 384 : return CONTINUE;
4718 : }
4719 :
4720 : // We are being asked to make a non-generic version. Keep track of how many
4721 : // non-generic versions we generate so as not to overdo it.
4722 0 : trace_count_++;
4723 6968 : if (trace_count_ < kMaxCopiesCodeGenerated &&
4724 3287 : compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
4725 : return CONTINUE;
4726 : }
4727 :
4728 : // If we get here code has been generated for this node too many times or
4729 : // recursion is too deep. Time to switch to a generic version. The code for
4730 : // generic versions above can handle deep recursion properly.
4731 394 : trace->Flush(compiler, this);
4732 394 : return DONE;
4733 : }
4734 :
4735 : bool
4736 1305 : RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
4737 : Trace* trace,
4738 : bool preload_has_checked_bounds,
4739 : jit::Label* on_possible_success,
4740 : QuickCheckDetails* details,
4741 : bool fall_through_on_failure)
4742 : {
4743 0 : if (details->characters() == 0) return false;
4744 0 : GetQuickCheckDetails(
4745 0 : details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
4746 0 : if (details->cannot_match()) return false;
4747 1185 : if (!details->Rationalize(compiler->latin1())) return false;
4748 0 : MOZ_ASSERT(details->characters() == 1 ||
4749 : compiler->macro_assembler()->CanReadUnaligned());
4750 1084 : uint32_t mask = details->mask();
4751 0 : uint32_t value = details->value();
4752 :
4753 0 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4754 :
4755 1084 : if (trace->characters_preloaded() != details->characters()) {
4756 0 : assembler->LoadCurrentCharacter(trace->cp_offset(),
4757 : trace->backtrack(),
4758 258 : !preload_has_checked_bounds,
4759 516 : details->characters());
4760 : }
4761 :
4762 0 : bool need_mask = true;
4763 :
4764 1084 : if (details->characters() == 1) {
4765 : // If number of characters preloaded is 1 then we used a byte or 16 bit
4766 : // load so the value is already masked down.
4767 76 : uint32_t char_mask = MaximumCharacter(compiler->latin1());
4768 38 : if ((mask & char_mask) == char_mask) need_mask = false;
4769 : mask &= char_mask;
4770 : } else {
4771 : // For 2-character preloads in LATIN1 mode or 1-character preloads in
4772 : // TWO_BYTE mode we also use a 16 bit load with zero extend.
4773 0 : if (details->characters() == 2 && compiler->latin1()) {
4774 149 : if ((mask & 0xffff) == 0xffff) need_mask = false;
4775 897 : } else if (details->characters() == 1 && !compiler->latin1()) {
4776 : if ((mask & 0xffff) == 0xffff) need_mask = false;
4777 : } else {
4778 897 : if (mask == 0xffffffff) need_mask = false;
4779 : }
4780 : }
4781 :
4782 0 : if (fall_through_on_failure) {
4783 860 : if (need_mask) {
4784 0 : assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
4785 : } else {
4786 766 : assembler->CheckCharacter(value, on_possible_success);
4787 : }
4788 : } else {
4789 224 : if (need_mask) {
4790 0 : assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
4791 : } else {
4792 179 : assembler->CheckNotCharacter(value, trace->backtrack());
4793 : }
4794 : }
4795 : return true;
4796 : }
4797 :
4798 : bool
4799 419 : TextNode::FillInBMInfo(int initial_offset,
4800 : int budget,
4801 : BoyerMooreLookahead* bm,
4802 : bool not_at_start)
4803 : {
4804 419 : if (!bm->CheckOverRecursed())
4805 : return false;
4806 :
4807 419 : if (initial_offset >= bm->length())
4808 : return true;
4809 :
4810 0 : int offset = initial_offset;
4811 0 : int max_char = bm->max_char();
4812 0 : for (size_t i = 0; i < elements().length(); i++) {
4813 0 : if (offset >= bm->length()) {
4814 0 : if (initial_offset == 0)
4815 0 : set_bm_info(not_at_start, bm);
4816 0 : return true;
4817 : }
4818 0 : TextElement text = elements()[i];
4819 0 : if (text.text_type() == TextElement::ATOM) {
4820 0 : RegExpAtom* atom = text.atom();
4821 0 : for (int j = 0; j < atom->length(); j++, offset++) {
4822 0 : if (offset >= bm->length()) {
4823 304 : if (initial_offset == 0)
4824 279 : set_bm_info(not_at_start, bm);
4825 : return true;
4826 : }
4827 4942 : char16_t character = atom->data()[j];
4828 0 : if (bm->compiler()->ignore_case()) {
4829 : char16_t chars[kEcma262UnCanonicalizeMaxWidth];
4830 0 : int length = GetCaseIndependentLetters(character,
4831 0 : bm->max_char() == kMaxOneByteCharCode,
4832 0 : bm->compiler()->unicode(),
4833 0 : chars);
4834 27 : for (int j = 0; j < length; j++)
4835 0 : bm->Set(offset, chars[j]);
4836 : } else {
4837 2462 : if (character <= max_char) bm->Set(offset, character);
4838 : }
4839 : }
4840 : } else {
4841 0 : MOZ_ASSERT(TextElement::CHAR_CLASS == text.text_type());
4842 0 : RegExpCharacterClass* char_class = text.char_class();
4843 108 : const CharacterRangeVector& ranges = char_class->ranges(alloc());
4844 54 : if (char_class->is_negated()) {
4845 : bm->SetAll(offset);
4846 : } else {
4847 0 : for (size_t k = 0; k < ranges.length(); k++) {
4848 214 : const CharacterRange& range = ranges[k];
4849 0 : if (range.from() > max_char)
4850 : continue;
4851 280 : int to = Min(max_char, static_cast<int>(range.to()));
4852 280 : bm->SetInterval(offset, Interval(range.from(), to));
4853 : }
4854 : }
4855 54 : offset++;
4856 : }
4857 : }
4858 114 : if (offset >= bm->length()) {
4859 62 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
4860 : return true;
4861 : }
4862 104 : if (!on_success()->FillInBMInfo(offset,
4863 : budget - 1,
4864 : bm,
4865 0 : true)) // Not at start after a text node.
4866 : return false;
4867 52 : if (initial_offset == 0)
4868 22 : set_bm_info(not_at_start, bm);
4869 : return true;
4870 : }
4871 :
4872 : // -------------------------------------------------------------------
4873 : // QuickCheckDetails
4874 :
4875 : // Takes the left-most 1-bit and smears it out, setting all bits to its right.
4876 : static inline uint32_t
4877 : SmearBitsRight(uint32_t v)
4878 : {
4879 0 : v |= v >> 1;
4880 0 : v |= v >> 2;
4881 0 : v |= v >> 4;
4882 506 : v |= v >> 8;
4883 506 : v |= v >> 16;
4884 : return v;
4885 : }
4886 :
4887 : // Here is the meat of GetQuickCheckDetails (see also the comment on the
4888 : // super-class in the .h file).
4889 : //
4890 : // We iterate along the text object, building up for each character a
4891 : // mask and value that can be used to test for a quick failure to match.
4892 : // The masks and values for the positions will be combined into a single
4893 : // machine word for the current character width in order to be used in
4894 : // generating a quick check.
4895 : void
4896 2320 : TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
4897 : RegExpCompiler* compiler,
4898 : int characters_filled_in,
4899 : bool not_at_start)
4900 : {
4901 0 : MOZ_ASSERT(characters_filled_in < details->characters());
4902 2320 : int characters = details->characters();
4903 0 : int char_mask = MaximumCharacter(compiler->latin1());
4904 :
4905 0 : for (size_t k = 0; k < elements().length(); k++) {
4906 0 : TextElement elm = elements()[k];
4907 0 : if (elm.text_type() == TextElement::ATOM) {
4908 2028 : const CharacterVector& quarks = elm.atom()->data();
4909 0 : for (size_t i = 0; i < (size_t) characters && i < quarks.length(); i++) {
4910 : QuickCheckDetails::Position* pos =
4911 0 : details->positions(characters_filled_in);
4912 6504 : char16_t c = quarks[i];
4913 6504 : if (c > char_mask) {
4914 : // If we expect a non-LATIN1 character from an LATIN1 string,
4915 : // there is no way we can match. Not even case independent
4916 : // matching can turn an LATIN1 character into non-LATIN1 or
4917 : // vice versa.
4918 0 : details->set_cannot_match();
4919 0 : pos->determines_perfectly = false;
4920 0 : return;
4921 : }
4922 0 : if (compiler->ignore_case()) {
4923 : char16_t chars[kEcma262UnCanonicalizeMaxWidth];
4924 0 : size_t length = GetCaseIndependentLetters(c, compiler->latin1(),
4925 0 : compiler->unicode(), chars);
4926 82 : MOZ_ASSERT(length != 0); // Can only happen if c > char_mask (see above).
4927 82 : if (length == 1) {
4928 : // This letter has no case equivalents, so it's nice and simple
4929 : // and the mask-compare will determine definitely whether we have
4930 : // a match at this character position.
4931 0 : pos->mask = char_mask;
4932 67 : pos->value = c;
4933 0 : pos->determines_perfectly = true;
4934 : } else {
4935 0 : uint32_t common_bits = char_mask;
4936 0 : uint32_t bits = chars[0];
4937 0 : for (size_t j = 1; j < length; j++) {
4938 0 : uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
4939 15 : common_bits ^= differing_bits;
4940 15 : bits &= common_bits;
4941 : }
4942 : // If length is 2 and common bits has only one zero in it then
4943 : // our mask and compare instruction will determine definitely
4944 : // whether we have a match at this character position. Otherwise
4945 : // it can only be an approximate check.
4946 0 : uint32_t one_zero = (common_bits | ~char_mask);
4947 15 : if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
4948 0 : pos->determines_perfectly = true;
4949 : }
4950 15 : pos->mask = common_bits;
4951 15 : pos->value = bits;
4952 : }
4953 : } else {
4954 : // Don't ignore case. Nice simple case where the mask-compare will
4955 : // determine definitely whether we have a match at this character
4956 : // position.
4957 0 : pos->mask = char_mask;
4958 6422 : pos->value = c;
4959 0 : pos->determines_perfectly = true;
4960 : }
4961 0 : characters_filled_in++;
4962 6504 : MOZ_ASSERT(characters_filled_in <= details->characters());
4963 6504 : if (characters_filled_in == details->characters()) {
4964 : return;
4965 : }
4966 : }
4967 : } else {
4968 : QuickCheckDetails::Position* pos =
4969 0 : details->positions(characters_filled_in);
4970 0 : RegExpCharacterClass* tree = elm.char_class();
4971 590 : const CharacterRangeVector& ranges = tree->ranges(alloc());
4972 295 : if (tree->is_negated()) {
4973 : // A quick check uses multi-character mask and compare. There is no
4974 : // useful way to incorporate a negative char class into this scheme
4975 : // so we just conservatively create a mask and value that will always
4976 : // succeed.
4977 47 : pos->mask = 0;
4978 47 : pos->value = 0;
4979 : } else {
4980 : size_t first_range = 0;
4981 0 : while (ranges[first_range].from() > char_mask) {
4982 0 : first_range++;
4983 0 : if (first_range == ranges.length()) {
4984 0 : details->set_cannot_match();
4985 0 : pos->determines_perfectly = false;
4986 0 : return;
4987 : }
4988 : }
4989 0 : CharacterRange range = ranges[first_range];
4990 0 : char16_t from = range.from();
4991 0 : char16_t to = range.to();
4992 248 : if (to > char_mask) {
4993 0 : to = char_mask;
4994 : }
4995 248 : uint32_t differing_bits = (from ^ to);
4996 : // A mask and compare is only perfect if the differing bits form a
4997 : // number like 00011111 with one single block of trailing 1s.
4998 0 : if ((differing_bits & (differing_bits + 1)) == 0 &&
4999 145 : from + differing_bits == to) {
5000 0 : pos->determines_perfectly = true;
5001 : }
5002 0 : uint32_t common_bits = ~SmearBitsRight(differing_bits);
5003 0 : uint32_t bits = (from & common_bits);
5004 0 : for (size_t i = first_range + 1; i < ranges.length(); i++) {
5005 0 : CharacterRange range = ranges[i];
5006 0 : char16_t from = range.from();
5007 0 : char16_t to = range.to();
5008 343 : if (from > char_mask) continue;
5009 258 : if (to > char_mask) to = char_mask;
5010 : // Here we are combining more ranges into the mask and compare
5011 : // value. With each new range the mask becomes more sparse and
5012 : // so the chances of a false positive rise. A character class
5013 : // with multiple ranges is assumed never to be equivalent to a
5014 : // mask and compare operation.
5015 0 : pos->determines_perfectly = false;
5016 0 : uint32_t new_common_bits = (from ^ to);
5017 0 : new_common_bits = ~SmearBitsRight(new_common_bits);
5018 0 : common_bits &= new_common_bits;
5019 0 : bits &= new_common_bits;
5020 0 : uint32_t differing_bits = (from & common_bits) ^ bits;
5021 258 : common_bits ^= differing_bits;
5022 0 : bits &= common_bits;
5023 : }
5024 248 : pos->mask = common_bits;
5025 0 : pos->value = bits;
5026 : }
5027 0 : characters_filled_in++;
5028 295 : MOZ_ASSERT(characters_filled_in <= details->characters());
5029 295 : if (characters_filled_in == details->characters()) {
5030 : return;
5031 : }
5032 : }
5033 : }
5034 0 : MOZ_ASSERT(characters_filled_in != details->characters());
5035 327 : if (!details->cannot_match()) {
5036 327 : on_success()-> GetQuickCheckDetails(details,
5037 : compiler,
5038 : characters_filled_in,
5039 327 : true);
5040 : }
5041 : }
5042 :
5043 : void
5044 0 : QuickCheckDetails::Clear()
5045 : {
5046 0 : for (int i = 0; i < characters_; i++) {
5047 0 : positions_[i].mask = 0;
5048 5006 : positions_[i].value = 0;
5049 0 : positions_[i].determines_perfectly = false;
5050 : }
5051 2564 : characters_ = 0;
5052 0 : }
5053 :
5054 : void
5055 0 : QuickCheckDetails::Advance(int by)
5056 : {
5057 1364 : MOZ_ASSERT(by >= 0);
5058 1364 : if (by >= characters_) {
5059 : Clear();
5060 : return;
5061 : }
5062 466 : for (int i = 0; i < characters_ - by; i++) {
5063 0 : positions_[i] = positions_[by + i];
5064 : }
5065 0 : for (int i = characters_ - by; i < characters_; i++) {
5066 0 : positions_[i].mask = 0;
5067 146 : positions_[i].value = 0;
5068 0 : positions_[i].determines_perfectly = false;
5069 : }
5070 106 : characters_ -= by;
5071 : // We could change mask_ and value_ here but we would never advance unless
5072 : // they had already been used in a check and they won't be used again because
5073 : // it would gain us nothing. So there's no point.
5074 : }
5075 :
5076 : bool
5077 0 : QuickCheckDetails::Rationalize(bool is_latin1)
5078 : {
5079 1185 : bool found_useful_op = false;
5080 0 : uint32_t char_mask = MaximumCharacter(is_latin1);
5081 :
5082 0 : mask_ = 0;
5083 0 : value_ = 0;
5084 0 : int char_shift = 0;
5085 0 : for (int i = 0; i < characters_; i++) {
5086 0 : Position* pos = &positions_[i];
5087 0 : if ((pos->mask & kMaxOneByteCharCode) != 0)
5088 0 : found_useful_op = true;
5089 0 : mask_ |= (pos->mask & char_mask) << char_shift;
5090 4173 : value_ |= (pos->value & char_mask) << char_shift;
5091 0 : char_shift += is_latin1 ? 8 : 16;
5092 : }
5093 1185 : return found_useful_op;
5094 : }
5095 :
5096 0 : void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index)
5097 : {
5098 955 : MOZ_ASSERT(characters_ == other->characters_);
5099 0 : if (other->cannot_match_)
5100 : return;
5101 0 : if (cannot_match_) {
5102 53 : *this = *other;
5103 0 : return;
5104 : }
5105 0 : for (int i = from_index; i < characters_; i++) {
5106 0 : QuickCheckDetails::Position* pos = positions(i);
5107 0 : QuickCheckDetails::Position* other_pos = other->positions(i);
5108 0 : if (pos->mask != other_pos->mask ||
5109 1125 : pos->value != other_pos->value ||
5110 419 : !other_pos->determines_perfectly) {
5111 : // Our mask-compare operation will be approximate unless we have the
5112 : // exact same operation on both sides of the alternation.
5113 0 : pos->determines_perfectly = false;
5114 : }
5115 0 : pos->mask &= other_pos->mask;
5116 0 : pos->value &= pos->mask;
5117 0 : other_pos->value &= pos->mask;
5118 0 : char16_t differing_bits = (pos->value ^ other_pos->value);
5119 : pos->mask &= ~differing_bits;
5120 : pos->value &= pos->mask;
5121 : }
5122 : }
|