Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99:
3 : * This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : /*
8 : * Streaming access to the raw tokens of JavaScript source.
9 : *
10 : * Because JS tokenization is context-sensitive -- a '/' could be either a
11 : * regular expression *or* a division operator depending on context -- the
12 : * various token stream classes are mostly not useful outside of the Parser
13 : * where they reside. We should probably eventually merge the two concepts.
14 : */
15 : #ifndef frontend_TokenStream_h
16 : #define frontend_TokenStream_h
17 :
18 : /*
19 : * A token stream exposes the raw tokens -- operators, names, numbers,
20 : * keywords, and so on -- of JavaScript source code.
21 : *
22 : * These are the components of the overall token stream concept:
23 : * TokenStreamShared, TokenStreamAnyChars, TokenStreamCharsBase<CharT>,
24 : * TokenStreamChars<CharT>, and TokenStreamSpecific<CharT, AnyCharsAccess>.
25 : *
26 : * == TokenStreamShared → ∅ ==
27 : *
28 : * Certain aspects of tokenizing are used everywhere:
29 : *
30 : * * modifiers (used to select which context-sensitive interpretation of a
31 : * character should be used to decide what token it is), modifier
32 : * exceptions, and modifier assertion handling;
33 : * * flags on the overall stream (have we encountered any characters on this
34 : * line? have we hit a syntax error? and so on);
35 : * * and certain token-count constants.
36 : *
37 : * These are all defined in TokenStreamShared. (They could be namespace-
38 : * scoped, but it seems tentatively better not to clutter the namespace.)
39 : *
40 : * == TokenStreamAnyChars → TokenStreamShared ==
41 : *
42 : * Certain aspects of tokenizing have meaning independent of the character type
43 : * of the source text being tokenized: line/column number information, tokens
44 : * in lookahead from determining the meaning of a prior token, compilation
45 : * options, the filename, flags, source map URL, access to details of the
46 : * current and next tokens (is the token of the given type? what name or
47 : * number is contained in the token? and other queries), and others.
48 : *
49 : * All this data/functionality *could* be duplicated for both single-byte and
50 : * double-byte tokenizing, but there are two problems. First, it's potentially
51 : * wasteful if the compiler doesn't recognize it can unify the concepts. (And
52 : * if any-character concepts are intermixed with character-specific concepts,
53 : * potentially the compiler *can't* unify them because offsets into the
54 : * hypothetical TokenStream<CharT>s would differ.) Second, some of this stuff
55 : * needs to be accessible in ParserBase, the aspects of JS language parsing
56 : * that have meaning independent of the character type of the source text being
57 : * parsed. So we need a separate data structure that ParserBase can hold on to
58 : * for it. (ParserBase isn't the only instance of this, but it's certainly the
59 : * biggest case of it.) Ergo, TokenStreamAnyChars.
60 : *
61 : * == TokenStreamCharsBase<CharT> → ∅ ==
62 : *
63 : * Certain data structures in tokenizing are character-type-specific:
64 : * the various pointers identifying the source text (including current offset
65 : * and end), and the temporary vector into which characters are read/written
66 : * in certain cases (think writing out the actual codepoints identified by an
67 : * identifier containing a Unicode escape, to create the atom for the
68 : * identifier: |a\u0062c| versus |abc|, for example).
69 : *
70 : * Additionally, some functions operating on this data are defined the same way
71 : * no matter what character type you have -- the offset being |offset - start|
72 : * no matter whether those two variables are single- or double-byte pointers.
73 : *
74 : * All such functionality lives in TokenStreamCharsBase<CharT>.
75 : *
76 : * == GeneralTokenStreamChars<CharT, AnyCharsAccess> →
77 : * TokenStreamCharsBase<CharT> ==
78 : *
79 : * Some functionality operates differently on different character types, just
80 : * as for TokenStreamCharsBase, but additionally requires access to character-
81 : * type-agnostic information in TokenStreamAnyChars. For example, getting the
82 : * next character performs different steps for different character types and
83 : * must access TokenStreamAnyChars to update line break information.
84 : *
85 : * Such functionality, if it can be defined using the same algorithm for all
86 : * character types, lives in GeneralTokenStreamChars<CharT, AnyCharsAccess>.
87 : * The AnyCharsAccess parameter provides a way for a GeneralTokenStreamChars
88 : * instance to access its corresponding TokenStreamAnyChars, without inheriting
89 : * from it.
90 : *
91 : * GeneralTokenStreamChars<CharT, AnyCharsAccess> is just functionality, no
92 : * actual member data.
93 : *
94 : * Functionality that must be defined *differently* per character type lives instead in
95 : * TokenStreamChars<CharT, AnyCharsAccess>: a declared-but-not-defined template class whose
96 : * specializations share a common public interface; see the next section.
97 : *
98 : * == TokenStreamChars<CharT, AnyCharsAccess> →
99 : * GeneralTokenStreamChars<CharT, AnyCharsAccess> ==
100 : *
101 : * Some functionality is like that in GeneralTokenStreamChars, *but* it's
102 : * defined entirely differently for different character types.
103 : *
104 : * For example, consider "match a multi-code unit code point" (hypothetically:
105 : * we've only implemented two-byte tokenizing right now):
106 : *
107 : * * For two-byte text, there must be two code units to get, the leading code
108 : * unit must be a UTF-16 lead surrogate, and the trailing code unit must be
109 : * a UTF-16 trailing surrogate. (If any of these fail to hold, a next code
110 : * unit encodes that code point and is not multi-code unit.)
111 : * * For single-byte Latin-1 text, there are no multi-code unit code points.
112 : * * For single-byte UTF-8 text, the first code unit must have N > 1 of its
113 : * highest bits set (and the next unset), and |N - 1| successive code units
114 : * must have their high bit set and next-highest bit unset, *and*
115 : * concatenating all unconstrained bits together must not produce a code
116 : * point value that could have been encoded in fewer code units.
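 *     (For instance, U+1F600 GRINNING FACE is the four UTF-8 code units
 *     F0 9F 98 80: the first unit has its four highest bits set and the next
 *     bit unset, each of the three trailing units begins with the bits 10,
 *     and the remaining, unconstrained bits concatenate to 0x1F600.)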
117 : *
118 : * This functionality can't be implemented as member functions in
119 : * GeneralTokenStreamChars because we'd need to *partially specialize* those
120 : * functions -- hold CharT constant while letting AnyCharsAccess vary. But
121 : * C++ forbids function template partial specialization like this: either you
122 : * fix *all* parameters or you fix none of them.
123 : *
124 : * Fortunately, C++ *does* allow *class* template partial specialization. So
125 : * TokenStreamChars is a template class with one specialization per CharT.
126 : * Functions can be defined differently in the different specializations,
127 : * because AnyCharsAccess as the only template parameter on member functions
128 : * *can* vary.
129 : *
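 * (Concretely, later in this file the two-byte case is the specialization
 * |TokenStreamChars<char16_t, AnyCharsAccess>|, which derives from
 * GeneralTokenStreamChars<char16_t, AnyCharsAccess>.)
 *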
130 : * All TokenStreamChars<CharT, AnyCharsAccess> specializations, one per CharT,
131 : * are just functionality, no actual member data.
132 : *
133 : * == TokenStreamSpecific<CharT, AnyCharsAccess> →
134 : * TokenStreamChars<CharT, AnyCharsAccess>, TokenStreamShared ==
135 : *
136 : * TokenStreamSpecific is operations that are parametrized on character type
137 : * but implement the *general* idea of tokenizing, without being intrinsically
138 : * tied to character type. Notably, this includes all operations that can
139 : * report warnings or errors at particular offsets, because we include a line
140 : * of context with such errors -- and that necessarily accesses the raw
141 : * characters of their specific type.
142 : *
143 : * Much of TokenStreamSpecific's operation depends on functionality in
144 : * TokenStreamAnyChars. The obvious solution is to inherit it -- but this
145 : * doesn't work in Parser: its ParserBase base class needs some
146 : * TokenStreamAnyChars functionality without knowing character type.
147 : *
148 : * The AnyCharsAccess type parameter is a class that statically converts from a
149 : * TokenStreamSpecific* to its corresponding TokenStreamAnyChars. The
150 : * TokenStreamSpecific in Parser<ParseHandler, CharT> can then specify a class
151 : * that properly converts from TokenStreamSpecific Parser::tokenStream to
152 : * TokenStreamAnyChars ParserBase::anyChars.
153 : *
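 * As a rough sketch (hypothetical name and members; the real class is the one
 * the Parser specifies), such a converter might look like:
 *
 *   struct ParserAnyCharsAccess
 *   {
 *       using Stream = TokenStreamSpecific<char16_t, ParserAnyCharsAccess>;
 *
 *       // Map &Parser::tokenStream back to ParserBase::anyChars, e.g. by
 *       // offset arithmetic on the enclosing Parser object.
 *       static TokenStreamAnyChars& anyChars(Stream* ts);
 *       static const TokenStreamAnyChars& anyChars(const Stream* ts);
 *   };
 *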
154 : * Could we hardcode one set of offset calculations for this and eliminate
155 : * AnyCharsAccess? No. Offset calculations possibly could be hardcoded if
156 : * TokenStreamSpecific were present in Parser before Parser::handler, assuring
157 : * the same offsets in all Parser-related cases. But there's still a separate
158 : * TokenStream class, that requires different offset calculations. So even if
159 : * we wanted to hardcode this (it's not clear we would, because forcing the
160 : * TokenStreamSpecific declarer to specify this is more explicit), we couldn't.
161 : */
162 :
163 : #include "mozilla/ArrayUtils.h"
164 : #include "mozilla/Assertions.h"
165 : #include "mozilla/Attributes.h"
166 : #include "mozilla/DebugOnly.h"
167 : #include "mozilla/MemoryChecking.h"
168 : #include "mozilla/PodOperations.h"
169 : #include "mozilla/TextUtils.h"
170 : #include "mozilla/TypeTraits.h"
171 : #include "mozilla/Unused.h"
172 :
173 : #include <algorithm>
174 : #include <stdarg.h>
175 : #include <stddef.h>
176 : #include <stdio.h>
177 :
178 : #include "jspubtd.h"
179 :
180 : #include "frontend/ErrorReporter.h"
181 : #include "frontend/TokenKind.h"
182 : #include "js/UniquePtr.h"
183 : #include "js/Vector.h"
184 : #include "util/Unicode.h"
185 : #include "vm/ErrorReporting.h"
186 : #include "vm/JSContext.h"
187 : #include "vm/RegExpShared.h"
188 : #include "vm/StringType.h"
189 :
190 : struct KeywordInfo;
191 :
192 : namespace js {
193 : namespace frontend {
194 :
195 : struct TokenPos {
196 : uint32_t begin; // Offset of the token's first char.
197 : uint32_t end; // Offset of 1 past the token's last char.
198 0 :
199 208612 : TokenPos() {}
200 : TokenPos(uint32_t begin, uint32_t end) : begin(begin), end(end) {}
201 :
202 0 : // Return a TokenPos that covers left, right, and anything in between.
203 0 : static TokenPos box(const TokenPos& left, const TokenPos& right) {
204 0 : MOZ_ASSERT(left.begin <= left.end);
205 0 : MOZ_ASSERT(left.end <= right.begin);
206 1007 : MOZ_ASSERT(right.begin <= right.end);
207 : return TokenPos(left.begin, right.end);
208 : }
209 :
210 : bool operator==(const TokenPos& bpos) const {
211 : return begin == bpos.begin && end == bpos.end;
212 : }
213 :
214 : bool operator!=(const TokenPos& bpos) const {
215 : return begin != bpos.begin || end != bpos.end;
216 : }
217 :
218 : bool operator <(const TokenPos& bpos) const {
219 : return begin < bpos.begin;
220 : }
221 :
222 : bool operator <=(const TokenPos& bpos) const {
223 : return begin <= bpos.begin;
224 : }
225 :
226 : bool operator >(const TokenPos& bpos) const {
227 : return !(*this <= bpos);
228 : }
229 :
230 : bool operator >=(const TokenPos& bpos) const {
231 : return !(*this < bpos);
232 : }
233 :
234 10072 : bool encloses(const TokenPos& pos) const {
235 : return begin <= pos.begin && pos.end <= end;
236 : }
237 : };
238 :
239 : enum DecimalPoint { NoDecimal = false, HasDecimal = true };
240 :
241 : enum class InvalidEscapeType {
242 : // No invalid character escapes.
243 : None,
244 : // A malformed \x escape.
245 : Hexadecimal,
246 : // A malformed \u escape.
247 : Unicode,
248 : // An otherwise well-formed \u escape which represents a
249 : // codepoint > 10FFFF.
250 : UnicodeOverflow,
251 : // An octal escape in a template token.
252 : Octal
253 : };
254 :
255 : // The only escapes found in IdentifierName are of the Unicode flavor.
256 : enum class IdentifierEscapes { None, SawUnicodeEscape };
257 :
258 : class TokenStreamShared;
259 263894 :
260 : struct Token
261 : {
262 : private:
263 : // Sometimes the parser needs to inform the tokenizer to interpret
264 : // subsequent text in a particular manner: for example, to tokenize a
265 : // keyword as an identifier, not as the actual keyword, on the right-hand
266 : // side of a dotted property access. Such information is communicated to
267 : // the tokenizer as a Modifier when getting the next token.
268 : //
269 : // Ideally this definition would reside in TokenStream as that's the real
270 : // user, but the debugging-use of it here causes a cyclic dependency (and
271 : // C++ provides no way to forward-declare an enum inside a class). So
272 : // define it here, then typedef it into TokenStream with static consts to
273 : // bring the initializers into scope.
274 : enum Modifier
275 : {
276 : // Normal operation.
277 : None,
278 :
279 : // Looking for an operand, not an operator. In practice, this means
280 : // that when '/' is seen, we look for a regexp instead of just returning
281 : // Div.
282 : Operand,
283 :
284 : // Treat subsequent characters as the tail of a template literal, after
285 : // a template substitution, beginning with a "}", continuing with zero
286 : // or more template literal characters, and ending with either "${" or
287 : // the end of the template literal. For example:
288 : //
289 : // var entity = "world";
290 : // var s = `Hello ${entity}!`;
291 : // ^ TemplateTail context
292 : TemplateTail,
293 : };
294 : enum ModifierException
295 : {
296 : NoException,
297 :
298 : // Used in following 2 cases:
299 : // a) After |yield| we look for a token on the same line that starts an
300 : // expression (Operand): |yield <expr>|. If no token is found, the
301 : // |yield| stands alone, and the next token on a subsequent line must
302 : // be: a comma continuing a comma expression, a semicolon terminating
303 : // the statement that ended with |yield|, or the start of another
304 : // statement (possibly an expression statement). The comma/semicolon
305 : // cases are gotten as operators (None), contrasting with Operand
306 : // earlier.
307 : // b) After an arrow function with a block body in an expression
308 : // statement, the next token must be: a colon in a conditional
309 : // expression, a comma continuing a comma expression, a semicolon
310 : // terminating the statement, or the token on a subsequent line that is
311 : // the start of another statement (possibly an expression statement).
312 : // The colon is gotten as an operator (None); it should only be gotten in
313 : // a conditional expression, and missing it results in a SyntaxError.
314 : // The comma/semicolon cases are also gotten as operators (None), and the
315 : // 4th case is gotten after them. If no comma/semicolon is found, only an
316 : // EOL, then the next token should be gotten as an operand in the 4th case
317 : // (especially if '/' is its first character). So we should peek the token
318 : // as an operand before trying to get the colon/comma/semicolon.
319 : // See also the comment in Parser::assignExpr().
320 : NoneIsOperand,
321 :
322 : // If a semicolon is inserted automatically, the next token is already
323 : // gotten with None, but we expect Operand.
324 : OperandIsNone,
325 : };
326 : friend class TokenStreamShared;
327 :
328 : public:
329 : // WARNING: TokenStreamPosition assumes that the only GC things a Token
330 : // includes are atoms. DON'T ADD NON-ATOM GC THING POINTERS HERE
331 : // UNLESS YOU ADD ADDITIONAL ROOTING TO THAT CLASS.
332 :
333 : TokenKind type; // char value or above enumerator
334 : TokenPos pos; // token position in file
335 : union {
336 : private:
337 : friend struct Token;
338 : PropertyName* name; // non-numeric atom
339 : JSAtom* atom; // potentially-numeric atom
340 : struct {
341 : double value; // floating point number
342 : DecimalPoint decimalPoint; // literal contains '.'
343 : } number;
344 : RegExpFlag reflags; // regexp flags; use tokenbuf to access
345 : // regexp chars
346 : } u;
347 : #ifdef DEBUG
348 : Modifier modifier; // Modifier used to get this token
349 : ModifierException modifierException; // Exception for this modifier
350 : #endif
351 :
352 : // Mutators
353 0 :
354 0 : void setName(PropertyName* name) {
355 0 : MOZ_ASSERT(type == TokenKind::Name);
356 242692 : u.name = name;
357 : }
358 0 :
359 32081 : void setAtom(JSAtom* atom) {
360 : MOZ_ASSERT(type == TokenKind::String ||
361 : type == TokenKind::TemplateHead ||
362 0 : type == TokenKind::NoSubsTemplate);
363 32081 : u.atom = atom;
364 : }
365 0 :
366 0 : void setRegExpFlags(RegExpFlag flags) {
367 0 : MOZ_ASSERT(type == TokenKind::RegExp);
368 0 : MOZ_ASSERT((flags & AllFlags) == flags);
369 296 : u.reflags = flags;
370 : }
371 0 :
372 0 : void setNumber(double n, DecimalPoint decimalPoint) {
373 0 : MOZ_ASSERT(type == TokenKind::Number);
374 0 : u.number.value = n;
375 8503 : u.number.decimalPoint = decimalPoint;
376 : }
377 :
378 : // Type-safe accessors
379 0 :
380 0 : PropertyName* name() const {
381 393375 : MOZ_ASSERT(type == TokenKind::Name);
382 : return u.name->JSAtom::asPropertyName(); // poor-man's type verification
383 : }
384 0 :
385 32831 : JSAtom* atom() const {
386 : MOZ_ASSERT(type == TokenKind::String ||
387 : type == TokenKind::TemplateHead ||
388 32831 : type == TokenKind::NoSubsTemplate);
389 : return u.atom;
390 : }
391 0 :
392 0 : RegExpFlag regExpFlags() const {
393 0 : MOZ_ASSERT(type == TokenKind::RegExp);
394 296 : MOZ_ASSERT((u.reflags & AllFlags) == u.reflags);
395 : return u.reflags;
396 : }
397 0 :
398 0 : double number() const {
399 8510 : MOZ_ASSERT(type == TokenKind::Number);
400 : return u.number.value;
401 : }
402 0 :
403 0 : DecimalPoint decimalPoint() const {
404 8503 : MOZ_ASSERT(type == TokenKind::Number);
405 : return u.number.decimalPoint;
406 : }
407 : };
408 :
409 : extern TokenKind
410 : ReservedWordTokenKind(PropertyName* str);
411 :
412 : extern const char*
413 : ReservedWordToCharZ(PropertyName* str);
414 :
415 : extern const char*
416 : ReservedWordToCharZ(TokenKind tt);
417 :
418 : // Ideally, tokenizing would be entirely independent of context. But the
419 : // strict mode flag, which is in SharedContext, affects tokenizing, and
420 : // TokenStream needs to see it.
421 : //
422 : // This class is a tiny back-channel from TokenStream to the strict mode flag
423 : // that avoids exposing the rest of SharedContext to TokenStream.
424 1221 : //
425 : class StrictModeGetter {
426 : public:
427 : virtual bool strictMode() = 0;
428 : };
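//
// A minimal sketch of an implementation (hypothetical; the real getter used
// by the parser consults the strict mode flag held in SharedContext):
//
//   class ForcedStrictModeGetter : public StrictModeGetter {
//       bool strict_;
//     public:
//       explicit ForcedStrictModeGetter(bool strict) : strict_(strict) {}
//       bool strictMode() override { return strict_; }
//   };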
429 :
430 : struct TokenStreamFlags
431 : {
432 : bool isEOF:1; // Hit end of file.
433 : bool isDirtyLine:1; // Non-whitespace since start of line.
434 : bool sawOctalEscape:1; // Saw an octal character escape.
435 : bool hadError:1; // Hit a syntax error, at start or during a
436 : // token.
437 :
438 127363 : TokenStreamFlags()
439 : : isEOF(), isDirtyLine(), sawOctalEscape(), hadError()
440 : {}
441 : };
442 :
443 : template<typename CharT>
444 : class TokenStreamPosition;
445 :
446 : /**
447 : * TokenStream types and constants that are used in both TokenStreamAnyChars
448 : * and TokenStreamSpecific. Do not add any non-static data members to this
449 : * class!
450 : */
451 : class TokenStreamShared
452 : {
453 : protected:
454 : static constexpr size_t ntokens = 4; // 1 current + 2 lookahead, rounded
455 : // to power of 2 to avoid divmod by 3
456 :
457 : static constexpr unsigned ntokensMask = ntokens - 1;
458 :
459 : template<typename CharT> friend class TokenStreamPosition;
460 :
461 : public:
462 : static constexpr unsigned maxLookahead = 2;
463 :
464 : static constexpr uint32_t NoOffset = UINT32_MAX;
465 :
466 : using Modifier = Token::Modifier;
467 : static constexpr Modifier None = Token::None;
468 : static constexpr Modifier Operand = Token::Operand;
469 : static constexpr Modifier TemplateTail = Token::TemplateTail;
470 :
471 : using ModifierException = Token::ModifierException;
472 : static constexpr ModifierException NoException = Token::NoException;
473 : static constexpr ModifierException NoneIsOperand = Token::NoneIsOperand;
474 : static constexpr ModifierException OperandIsNone = Token::OperandIsNone;
475 :
476 1632720 : static void
477 : verifyConsistentModifier(Modifier modifier, Token lookaheadToken)
478 : {
479 : #ifdef DEBUG
480 1632720 : // Easy case: modifiers match.
481 : if (modifier == lookaheadToken.modifier)
482 : return;
483 568421 :
484 : if (lookaheadToken.modifierException == OperandIsNone) {
485 568422 : // getToken(Operand) permissibly following getToken().
486 : if (modifier == Operand && lookaheadToken.modifier == None)
487 : return;
488 : }
489 0 :
490 : if (lookaheadToken.modifierException == NoneIsOperand) {
491 0 : // getToken() permissibly following getToken(Operand).
492 : if (modifier == None && lookaheadToken.modifier == Operand)
493 : return;
494 : }
495 0 :
496 : MOZ_ASSERT_UNREACHABLE("this token was previously looked up with a "
497 : "different modifier, potentially making "
498 : "tokenization non-deterministic");
499 : #endif
500 : }
501 : };
502 :
503 : static_assert(mozilla::IsEmpty<TokenStreamShared>::value,
504 : "TokenStreamShared shouldn't bloat classes that inherit from it");
505 :
506 : template<typename CharT, class AnyCharsAccess>
507 : class TokenStreamSpecific;
508 :
509 : template<typename CharT>
510 : class MOZ_STACK_CLASS TokenStreamPosition final
511 : {
512 : public:
513 : // The JS_HAZ_ROOTED is permissible below because: 1) the only field in
514 : // TokenStreamPosition that can keep GC things alive is Token, 2) the only
515 : // GC things Token can keep alive are atoms, and 3) the AutoKeepAtoms&
516 : // passed to the constructor here represents that collection of atoms
517 : // is disabled while atoms in Tokens in this Position are alive. DON'T
518 : // ADD NON-ATOM GC THING POINTERS HERE! They would create a rooting
519 : // hazard that JS_HAZ_ROOTED will cause to be ignored.
520 : template<class AnyCharsAccess>
521 : inline TokenStreamPosition(AutoKeepAtoms& keepAtoms,
522 : TokenStreamSpecific<CharT, AnyCharsAccess>& tokenStream);
523 :
524 : private:
525 : TokenStreamPosition(const TokenStreamPosition&) = delete;
526 :
527 : // Technically only TokenStreamSpecific<CharT, AnyCharsAccess>::seek with
528 : // CharT constant and AnyCharsAccess varying must be friended, but 1) it's
529 : // hard to friend one function in template classes, and 2) C++ doesn't
530 : // allow partial friend specialization to target just that single class.
531 : template<typename Char, class AnyCharsAccess> friend class TokenStreamSpecific;
532 :
533 : const CharT* buf;
534 : TokenStreamFlags flags;
535 : unsigned lineno;
536 : size_t linebase;
537 : size_t prevLinebase;
538 : Token currentToken;
539 : unsigned lookahead;
540 : Token lookaheadTokens[TokenStreamShared::maxLookahead];
541 : } JS_HAZ_ROOTED;
542 6112 :
543 : class TokenStreamAnyChars
544 : : public TokenStreamShared
545 : {
546 : public:
547 : TokenStreamAnyChars(JSContext* cx, const ReadOnlyCompileOptions& options,
548 : StrictModeGetter* smg);
549 :
550 : template<typename CharT, class AnyCharsAccess> friend class GeneralTokenStreamChars;
551 : template<typename CharT, class AnyCharsAccess> friend class TokenStreamChars;
552 : template<typename CharT, class AnyCharsAccess> friend class TokenStreamSpecific;
553 :
554 : template<typename CharT> friend class TokenStreamPosition;
555 :
556 : // Accessors.
557 0 : unsigned cursor() const { return cursor_; }
558 186690 : unsigned nextCursor() const { return (cursor_ + 1) & ntokensMask; }
559 : unsigned aheadCursor(unsigned steps) const { return (cursor_ + steps) & ntokensMask; }
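    // With ntokens == 4 and ntokensMask == 3 the cursor indexing wraps, e.g.
    // aheadCursor(2) with cursor_ == 3 yields (3 + 2) & 3 == 1.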
560 5272388 :
561 : const Token& currentToken() const { return tokens[cursor()]; }
562 777430 : bool isCurrentTokenType(TokenKind type) const {
563 : return currentToken().type == type;
564 : }
565 :
566 : MOZ_MUST_USE bool checkOptions();
567 :
568 : private:
569 : PropertyName* reservedWordToPropertyName(TokenKind tt) const;
570 :
571 0 : public:
572 0 : PropertyName* currentName() const {
573 245898 : if (isCurrentTokenType(TokenKind::Name))
574 : return currentToken().name();
575 0 :
576 4958 : MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
577 : return reservedWordToPropertyName(currentToken().type);
578 : }
579 0 :
580 0 : bool currentNameHasEscapes() const {
581 0 : if (isCurrentTokenType(TokenKind::Name)) {
582 442478 : TokenPos pos = currentToken().pos;
583 : return (pos.end - pos.begin) != currentToken().name()->length();
584 : }
585 670 :
586 : MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
587 : return false;
588 : }
589 :
590 195644 : bool isCurrentTokenAssignment() const {
591 : return TokenKindIsAssignment(currentToken().type);
592 : }
593 :
594 0 : // Flag methods.
595 0 : bool isEOF() const { return flags.isEOF; }
596 0 : bool sawOctalEscape() const { return flags.sawOctalEscape; }
597 12410 : bool hadError() const { return flags.hadError; }
598 : void clearSawOctalEscape() { flags.sawOctalEscape = false; }
599 :
600 : bool hasInvalidTemplateEscape() const {
601 : return invalidTemplateEscapeType != InvalidEscapeType::None;
602 : }
603 0 : void clearInvalidTemplateEscape() {
604 : invalidTemplateEscapeType = InvalidEscapeType::None;
605 : }
606 :
607 : private:
608 : // This is private because it should only be called by the tokenizer while
609 0 : // tokenizing, not by, for example, BytecodeEmitter.
610 : bool strictMode() const { return strictModeGetter && strictModeGetter->strictMode(); }
611 0 :
612 0 : void setInvalidTemplateEscape(uint32_t offset, InvalidEscapeType type) {
613 0 : MOZ_ASSERT(type != InvalidEscapeType::None);
614 : if (invalidTemplateEscapeType != InvalidEscapeType::None)
615 0 : return;
616 0 : invalidTemplateEscapeOffset = offset;
617 : invalidTemplateEscapeType = type;
618 : }
619 :
620 : uint32_t invalidTemplateEscapeOffset = 0;
621 : InvalidEscapeType invalidTemplateEscapeType = InvalidEscapeType::None;
622 :
623 180006 : public:
624 : void addModifierException(ModifierException modifierException) {
625 180006 : #ifdef DEBUG
626 : const Token& next = nextToken();
627 :
628 : // Permit adding the same exception multiple times. This is important
629 : // particularly for Parser::assignExpr's early fast-path cases and
630 : // arrow function parsing: we want to add modifier exceptions in the
631 : // fast paths, then potentially (but not necessarily) duplicate them
632 180006 : // after parsing all of an arrow function.
633 : if (next.modifierException == modifierException)
634 : return;
635 180006 :
636 : if (next.modifierException == NoneIsOperand) {
637 : // Token after yield expression without operand already has
638 0 : // NoneIsOperand exception.
639 0 : MOZ_ASSERT(modifierException == OperandIsNone);
640 : MOZ_ASSERT(next.type != TokenKind::Div,
641 : "next token requires contextual specifier to be parsed unambiguously");
642 :
643 : // Do not update modifierException.
644 : return;
645 : }
646 0 :
647 180006 : MOZ_ASSERT(next.modifierException == NoException);
648 : switch (modifierException) {
649 0 : case NoneIsOperand:
650 0 : MOZ_ASSERT(next.modifier == Operand);
651 : MOZ_ASSERT(next.type != TokenKind::Div,
652 : "next token requires contextual specifier to be parsed unambiguously");
653 : break;
654 0 : case OperandIsNone:
655 180006 : MOZ_ASSERT(next.modifier == None);
656 : MOZ_ASSERT(next.type != TokenKind::Div && next.type != TokenKind::RegExp,
657 : "next token requires contextual specifier to be parsed unambiguously");
658 : break;
659 0 : default:
660 : MOZ_CRASH("unexpected modifier exception");
661 360012 : }
662 : tokens[nextCursor()].modifierException = modifierException;
663 : #endif
664 : }
665 :
666 : #ifdef DEBUG
667 : inline bool debugHasNoLookahead() const {
668 : return lookahead == 0;
669 : }
670 : #endif
671 :
672 2442 : bool hasDisplayURL() const {
673 : return displayURL_ != nullptr;
674 : }
675 :
676 36 : char16_t* displayURL() {
677 : return displayURL_.get();
678 : }
679 :
680 2442 : bool hasSourceMapURL() const {
681 : return sourceMapURL_ != nullptr;
682 : }
683 :
684 0 : char16_t* sourceMapURL() {
685 : return sourceMapURL_.get();
686 : }
687 :
688 : // This class maps a sourceUnits offset (which is 0-indexed) to a line
689 1528 : // number (which is 1-indexed) and a column index (which is 0-indexed).
690 : class SourceCoords
691 : {
692 : // For a given buffer holding source code, |lineStartOffsets_| has one
693 : // element per line of source code, plus one sentinel element. Each
694 : // non-sentinel element holds the buffer offset for the start of the
695 : // corresponding line of source code. For this example script,
696 : // assuming an initialLineOffset of 0:
697 : //
698 : // 1 // xyz [line starts at offset 0]
699 : // 2 var x; [line starts at offset 7]
700 : // 3 [line starts at offset 14]
701 : // 4 var y; [line starts at offset 15]
702 : //
703 : // |lineStartOffsets_| is:
704 : //
705 : // [0, 7, 14, 15, MAX_PTR]
706 : //
707 : // To convert a "line number" to a "line index" (i.e. an index into
708 : // |lineStartOffsets_|), subtract |initialLineNum_|. E.g. line 3's
709 : // line index is (3 - initialLineNum_), which is 2. Therefore
710 : // lineStartOffsets_[2] holds the buffer offset for the start of line 3,
711 : // which is 14. (Note that |initialLineNum_| is often 1, but not
712 : // always.)
713 : //
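        // For example, with the table above and an |initialLineNum_| of 1,
        // lineNum(16) is 4 and columnIndex(16) is 1: offset 16 is the 'a' of
        // the second |var|.
        //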
714 : // The first element is always initialLineOffset, passed to the
715 : // constructor, and the last element is always the MAX_PTR sentinel.
716 : //
717 : // offset-to-line/column lookups are O(log n) in the worst case (binary
718 : // search), but in practice they're heavily clustered and we do better
719 : // than that by using the previous lookup's result (lastLineIndex_) as
720 : // a starting point.
721 : //
722 : // Checking if an offset lies within a particular line number
723 : // (isOnThisLine()) is O(1).
724 : //
725 : Vector<uint32_t, 128> lineStartOffsets_;
726 : uint32_t initialLineNum_;
727 : uint32_t initialColumn_;
728 :
729 : // This is mutable because it's modified on every search, but that fact
730 : // isn't visible outside this class.
731 : mutable uint32_t lastLineIndex_;
732 :
733 : uint32_t lineIndexOf(uint32_t offset) const;
734 :
735 : static const uint32_t MAX_PTR = UINT32_MAX;
736 0 :
737 0 : uint32_t lineIndexToNum(uint32_t lineIndex) const { return lineIndex + initialLineNum_; }
738 0 : uint32_t lineNumToIndex(uint32_t lineNum) const { return lineNum - initialLineNum_; }
739 0 : uint32_t lineIndexAndOffsetToColumn(uint32_t lineIndex, uint32_t offset) const {
740 0 : uint32_t lineStartOffset = lineStartOffsets_[lineIndex];
741 0 : MOZ_RELEASE_ASSERT(offset >= lineStartOffset);
742 0 : uint32_t column = offset - lineStartOffset;
743 2418 : if (lineIndex == 0)
744 : return column + initialColumn_;
745 : return column;
746 : }
747 :
748 : public:
749 : SourceCoords(JSContext* cx, uint32_t ln, uint32_t col, uint32_t initialLineOffset);
750 :
751 : MOZ_MUST_USE bool add(uint32_t lineNum, uint32_t lineStartOffset);
752 : MOZ_MUST_USE bool fill(const SourceCoords& other);
753 0 :
754 0 : bool isOnThisLine(uint32_t offset, uint32_t lineNum, bool* onThisLine) const {
755 945766 : uint32_t lineIndex = lineNumToIndex(lineNum);
756 : if (lineIndex + 1 >= lineStartOffsets_.length()) // +1 due to sentinel
757 0 : return false;
758 0 : *onThisLine = lineStartOffsets_[lineIndex] <= offset &&
759 945775 : offset < lineStartOffsets_[lineIndex + 1];
760 : return true;
761 : }
762 :
763 : uint32_t lineNum(uint32_t offset) const;
764 : uint32_t columnIndex(uint32_t offset) const;
765 : void lineNumAndColumnIndex(uint32_t offset, uint32_t* lineNum, uint32_t* column) const;
766 : };
767 :
768 : SourceCoords srcCoords;
769 :
770 : JSContext* context() const {
771 : return cx;
772 : }
773 :
774 : /**
775 : * Fill in |err|, excepting line-of-context-related fields. If the token
776 : * stream has location information, use that and return true. If it does
777 : * not, use the caller's location information and return false.
778 : */
779 : bool fillExcludingContext(ErrorMetadata* err, uint32_t offset);
780 :
781 : void updateFlagsForEOL();
782 :
783 : private:
784 : MOZ_MUST_USE MOZ_ALWAYS_INLINE bool internalUpdateLineInfoForEOL(uint32_t lineStartOffset);
785 :
786 : void undoInternalUpdateLineInfoForEOL();
787 :
788 0 : public:
789 0 : const Token& nextToken() const {
790 2061698 : MOZ_ASSERT(hasLookahead());
791 : return tokens[nextCursor()];
792 : }
793 :
794 : bool hasLookahead() const { return lookahead > 0; }
795 :
796 2111543 : void advanceCursor() {
797 : cursor_ = (cursor_ + 1) & ntokensMask;
798 : }
799 :
800 1226775 : void retractCursor() {
801 : cursor_ = (cursor_ - 1) & ntokensMask;
802 : }
803 :
804 : Token* allocateToken() {
805 : advanceCursor();
806 883823 :
807 : Token* tp = &tokens[cursor()];
808 : MOZ_MAKE_MEM_UNDEFINED(tp, sizeof(*tp));
809 :
810 : return tp;
811 : }
812 :
813 0 : // Push the last scanned token back into the stream.
814 0 : void ungetToken() {
815 0 : MOZ_ASSERT(lookahead < maxLookahead);
816 0 : lookahead++;
817 1226775 : retractCursor();
818 : }
819 :
820 : public:
821 : MOZ_MUST_USE bool compileWarning(ErrorMetadata&& metadata, UniquePtr<JSErrorNotes> notes,
822 : unsigned flags, unsigned errorNumber, va_list args);
823 :
824 : // Compute error metadata for an error at no offset.
825 : void computeErrorMetadataNoOffset(ErrorMetadata* err);
826 :
827 : // ErrorReporter API Helpers
828 :
829 : void lineAndColumnAt(size_t offset, uint32_t *line, uint32_t *column) const;
830 :
831 : // This is just straight up duplicated from TokenStreamSpecific's inheritance of
832 : // ErrorReporter's reportErrorNoOffset. varargs delenda est.
833 : void reportErrorNoOffset(unsigned errorNumber, ...);
834 : void reportErrorNoOffsetVA(unsigned errorNumber, va_list args);
835 :
836 : const JS::ReadOnlyCompileOptions& options() const {
837 : return options_;
838 : }
839 :
840 : const char* getFilename() const {
841 : return filename_;
842 : }
843 :
844 : protected:
845 : // Options used for parsing/tokenizing.
846 : const ReadOnlyCompileOptions& options_;
847 :
848 : Token tokens[ntokens]; // circular token buffer
849 : private:
850 : unsigned cursor_; // index of last parsed token
851 : protected:
852 : unsigned lookahead; // count of lookahead tokens
853 : unsigned lineno; // current line number
854 : TokenStreamFlags flags; // flags -- see above
855 : size_t linebase; // start of current line
856 : size_t prevLinebase; // start of previous line; size_t(-1) if on the first line
857 : const char* filename_; // input filename or null
858 : UniqueTwoByteChars displayURL_; // the user's requested source URL or null
859 : UniqueTwoByteChars sourceMapURL_; // source map's filename or null
860 :
861 : /**
862 : * An array storing whether a TokenKind observed while attempting to extend
863 : * a valid AssignmentExpression into an even longer AssignmentExpression
864 : * (e.g., extending '3' to '3 + 5') will terminate it without error.
865 : *
866 : * For example, ';' always ends an AssignmentExpression because it ends a
867 : * Statement or declaration. '}' always ends an AssignmentExpression
868 : * because it terminates BlockStatement, FunctionBody, and embedded
869 : * expressions in TemplateLiterals. Therefore both entries are set to true
870 : * in TokenStreamAnyChars construction.
871 : *
872 : * But e.g. '+' *could* extend an AssignmentExpression, so its entry here
873 : * is false. Meanwhile 'this' can't extend an AssignmentExpression, but
874 : * it's only valid after a line break, so its entry here must be false.
875 : *
876 : * NOTE: This array could be static, but without C99's designated
877 : * initializers it's easier zeroing here and setting the true entries
878 : * in the constructor body. (Having this per-instance might also aid
879 : * locality.) Don't worry! Initialization time for each TokenStream
880 : * is trivial. See bug 639420.
881 : */
882 : bool isExprEnding[size_t(TokenKind::Limit)] = {}; // all-false initially
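    // For example (a sketch only; the constructor holds the authoritative
    // list), construction does roughly:
    //
    //   isExprEnding[size_t(TokenKind::Semi)] = true;  // ';'
    //   isExprEnding[size_t(TokenKind::Rc)] = true;    // '}'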
883 :
884 : JSContext* const cx;
885 : bool mutedErrors;
886 : StrictModeGetter* strictModeGetter; // used to test for strict mode
887 : };
888 :
889 : // This is the low-level interface to the JS source code buffer. It just gets
890 : // raw Unicode code units -- 16-bit char16_t units of source text that are not
891 : // (always) full code points, and 8-bit units of UTF-8 source text soon.
892 : // TokenStreams' functions are layered on top and do some extra stuff like
893 : // converting all EOL sequences to '\n', tracking the line number, and setting
894 : // |flags.isEOF|. (The "raw" in "raw Unicode code units" refers to the lack of
895 : // EOL sequence normalization.)
896 : //
897 : // buf[0..length-1] often represents a substring of some larger source,
898 : // where we have only the substring in memory. The |startOffset| argument
899 : // indicates the offset within this larger string at which our string
900 : // begins, the offset of |buf[0]|.
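//
// For example (a sketch; |chars| and |length| are assumed to describe the
// substring), constructing SourceUnits over a chunk that begins at offset 100
// of the overall source:
//
//   SourceUnits<char16_t> units(chars, length, 100);
//   MOZ_ASSERT(units.offset() == 100);  // no code units consumed yet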
901 : template<typename CharT>
902 : class SourceUnits
903 : {
904 0 : public:
905 : SourceUnits(const CharT* buf, size_t length, size_t startOffset)
906 : : base_(buf),
907 1528 : startOffset_(startOffset),
908 3056 : limit_(buf + length),
909 : ptr(buf)
910 : { }
911 :
912 5765321 : bool atStart() const {
913 : MOZ_ASSERT(ptr, "shouldn't be using if poisoned");
914 : return ptr == base_;
915 : }
916 1864891 :
917 : bool atEnd() const {
918 : MOZ_ASSERT(ptr <= limit_, "shouldn't have overrun");
919 : return ptr >= limit_;
920 0 : }
921 :
922 : size_t startOffset() const {
923 : return startOffset_;
924 3866339 : }
925 :
926 : size_t offset() const {
927 26 : return startOffset_ + mozilla::PointerRangeSize(base_, ptr);
928 26 : }
929 26 :
930 26 : const CharT* codeUnitPtrAt(size_t offset) const {
931 : MOZ_ASSERT(startOffset_ <= offset);
932 : MOZ_ASSERT(offset - startOffset_ <= mozilla::PointerRangeSize(base_, limit_));
933 : return base_ + (offset - startOffset_);
934 : }
935 :
936 : const CharT* limit() const {
937 1224940 : return limit_;
938 1224940 : }
939 1224954 :
940 0 : CharT previousCodeUnit() {
941 : MOZ_ASSERT(ptr, "can't get previous code unit if poisoned");
942 : MOZ_ASSERT(!atStart(), "must have a previous code unit to get");
943 : return *(ptr - 1);
944 0 : }
945 :
946 : CharT getCodeUnit() {
947 : return *ptr++; // this will nullptr-crash if poisoned
948 852 : }
949 :
950 : CharT peekCodeUnit() const {
951 : return *ptr; // this will nullptr-crash if poisoned
952 167394 : }
953 47240 :
954 : bool peekCodeUnits(uint8_t n, CharT* out) const {
955 : MOZ_ASSERT(ptr, "shouldn't peek into poisoned SourceUnits");
956 : if (n > mozilla::PointerRangeSize(ptr, limit_))
957 : return false;
958 :
959 : std::copy_n(ptr, n, out);
960 : return true;
961 : }
962 :
963 : void skipCodeUnits(uint32_t n) {
964 18 : MOZ_ASSERT(ptr, "shouldn't use poisoned SourceUnits");
965 18 : MOZ_ASSERT(n <= mozilla::PointerRangeSize(ptr, limit_),
966 18 : "shouldn't skip beyond end of SourceUnits");
967 : ptr += n;
968 : }
969 :
970 0 : void unskipCodeUnits(uint32_t n) {
971 0 : MOZ_ASSERT(ptr, "shouldn't use poisoned SourceUnits");
972 18 : MOZ_ASSERT(n <= mozilla::PointerRangeSize(base_, ptr),
973 : "shouldn't skip beyond start of SourceUnits");
974 639918 : ptr -= n;
975 639919 : }
976 639919 :
977 639919 : bool matchCodeUnit(CharT c) {
978 639919 : if (*ptr == c) { // this will nullptr-crash if poisoned
979 : ptr++;
980 833199 : return true;
981 833199 : }
982 833199 : return false;
983 : }
984 :
985 : /**
986 2408 : * Unget the '\r' (CR) that precedes a '\n' (LF), when ungetting a line
987 2408 : * terminator that's a full "\r\n" sequence. If the prior code unit isn't
988 2408 : * '\r', do nothing.
989 2408 : */
990 : void ungetOptionalCRBeforeLF() {
991 : MOZ_ASSERT(ptr, "shouldn't unget a '\\r' from poisoned SourceUnits");
992 : MOZ_ASSERT(*ptr == CharT('\n'),
993 : "function should only be called when a '\\n' was just "
994 0 : "ungotten, and any '\\r' preceding it must also be "
995 : "ungotten");
996 : if (*(ptr - 1) == CharT('\r'))
997 : ptr--;
998 : }
999 2042068 :
1000 1021034 : void ungetCodeUnit() {
1001 1021034 : MOZ_ASSERT(!atStart(), "can't unget if currently at start");
1002 : MOZ_ASSERT(ptr); // make sure it hasn't been poisoned
1003 : ptr--;
1004 : }
1005 :
1006 : const CharT* addressOfNextCodeUnit(bool allowPoisoned = false) const {
1007 : MOZ_ASSERT_IF(!allowPoisoned, ptr); // make sure it hasn't been poisoned
1008 : return ptr;
1009 : }
1010 :
1011 : // Use this with caution!
1012 : void setAddressOfNextCodeUnit(const CharT* a, bool allowPoisoned = false) {
1013 : MOZ_ASSERT_IF(!allowPoisoned, a);
1014 : ptr = a;
1015 : }
1016 :
1017 : // Poison the SourceUnits so they can't be accessed again.
1018 : void poisonInDebug() {
1019 : #ifdef DEBUG
1020 : ptr = nullptr;
1021 : #endif
1022 : }
1023 :
1024 0 : static bool isRawEOLChar(int32_t c) {
1025 : return c == '\n' ||
1026 : c == '\r' ||
1027 : c == unicode::LINE_SEPARATOR ||
1028 : c == unicode::PARA_SEPARATOR;
1029 : }
1030 :
1031 : // Returns the offset of the next EOL, but stops once 'max' characters
1032 : // have been scanned (*including* the char at startOffset_).
1033 : size_t findEOLMax(size_t start, size_t max);
1034 :
1035 : private:
1036 : /** Base of buffer. */
1037 0 : const CharT* base_;
1038 :
1039 : /** Offset of base_[0]. */
1040 : uint32_t startOffset_;
1041 :
1042 : /** Limit for quick bounds check. */
1043 : const CharT* limit_;
1044 :
1045 : /** Next char to get. */
1046 : const CharT* ptr;
1047 167394 : };
1048 167394 :
1049 167394 : template<typename CharT>
1050 502182 : class TokenStreamCharsBase
1051 : {
1052 : protected:
1053 : void ungetCodeUnit(int32_t c) {
1054 : if (c == EOF)
1055 13 : return;
1056 7851 :
1057 : sourceUnits.ungetCodeUnit();
1058 : }
1059 :
1060 : public:
1061 3919 : using CharBuffer = Vector<CharT, 32>;
1062 3919 :
1063 0 : TokenStreamCharsBase(JSContext* cx, const CharT* chars, size_t length, size_t startOffset);
1064 0 :
1065 0 : static MOZ_ALWAYS_INLINE JSAtom*
1066 : atomizeChars(JSContext* cx, const CharT* chars, size_t length);
1067 :
1068 3919 : const CharBuffer& getTokenbuf() const { return tokenbuf; }
1069 0 :
1070 : MOZ_MUST_USE bool copyTokenbufTo(JSContext* cx,
1071 3919 : UniquePtr<char16_t[], JS::FreePolicy>* destination);
1072 :
1073 : using SourceUnits = frontend::SourceUnits<CharT>;
1074 :
1075 : MOZ_MUST_USE bool appendCodePointToTokenbuf(uint32_t codePoint);
1076 :
1077 : // |expect| cannot be an EOL char.
1078 : bool matchCodeUnit(int32_t expect) {
1079 : MOZ_ASSERT(expect != EOF, "shouldn't be matching EOFs");
1080 : MOZ_ASSERT(!SourceUnits::isRawEOLChar(expect));
1081 : return MOZ_LIKELY(!sourceUnits.atEnd()) && sourceUnits.matchCodeUnit(expect);
1082 0 : }
1083 2454190 :
1084 : protected:
1085 : int32_t peekCodeUnit() {
1086 : return MOZ_LIKELY(!sourceUnits.atEnd()) ? sourceUnits.peekCodeUnit() : EOF;
1087 : }
1088 :
1089 : void consumeKnownCodeUnit(int32_t unit) {
1090 : MOZ_ASSERT(unit != EOF, "shouldn't be matching EOF");
1091 : MOZ_ASSERT(!sourceUnits.atEnd(), "must have units to consume");
1092 : #ifdef DEBUG
1093 : CharT next =
1094 : #endif
1095 : sourceUnits.getCodeUnit();
1096 : MOZ_ASSERT(next == unit, "must be consuming the correct unit");
1097 : }
1098 274740 :
1099 : MOZ_MUST_USE bool
1100 : fillWithTemplateStringContents(CharBuffer& charbuf, const CharT* cur, const CharT* end) {
1101 : while (cur < end) {
1102 : // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR are
1103 : // interpreted literally inside template literal contents; only
1104 : // literal CRLF sequences are normalized to '\n'. See
1105 : // <https://tc39.github.io/ecma262/#sec-static-semantics-tv-and-trv>.
1106 : CharT ch = *cur;
1107 : if (ch == '\r') {
1108 : ch = '\n';
1109 : if ((cur + 1 < end) && (*(cur + 1) == '\n'))
1110 : cur++;
1111 : }
1112 :
1113 0 : if (!charbuf.append(ch))
1114 905471 : return false;
1115 :
1116 : cur++;
1117 : }
1118 :
1119 883819 : return true;
1120 : }
1121 :
1122 : /**
1123 3056 : * Determine whether a code unit constitutes a complete ASCII code point.
1124 : * (The code point's exact value might not be used, however, if subsequent
1125 : * code observes that |unit| is part of a LineTerminatorSequence.)
1126 : */
1127 : static constexpr MOZ_ALWAYS_INLINE MOZ_MUST_USE bool isAsciiCodePoint(CharT unit) {
1128 : return mozilla::IsAscii(unit);
1129 : }
1130 :
1131 : protected:
1132 : /** Code units in the source code being tokenized. */
1133 : SourceUnits sourceUnits;
1134 :
1135 : /** Current token string buffer. */
1136 : CharBuffer tokenbuf;
1137 884014 : };
1138 :
1139 : template<>
1140 : /* static */ MOZ_ALWAYS_INLINE JSAtom*
1141 : TokenStreamCharsBase<char16_t>::atomizeChars(JSContext* cx, const char16_t* chars, size_t length)
1142 : {
1143 884019 : return AtomizeChars(cx, chars, length);
1144 884019 : }
1145 :
1146 : /** A small class encapsulating computation of the start-offset of a Token. */
1147 : class TokenStart
1148 : {
1149 : uint32_t startOffset_;
1150 :
1151 : public:
1152 : /**
1153 : * Compute a starting offset that is the current offset of |sourceUnits|,
1154 : * offset by |adjust|. (For example, |adjust| of -1 indicates the code
1155 : * unit one backwards from |sourceUnits|'s current offset.)
1156 0 : */
1157 : template<class SourceUnits>
1158 : TokenStart(const SourceUnits& sourceUnits, ptrdiff_t adjust)
1159 4807356 : : startOffset_(sourceUnits.offset() + adjust)
1160 : {}
1161 :
1162 : TokenStart(const TokenStart&) = default;
1163 0 :
1164 : uint32_t offset() const { return startOffset_; }
1165 : };
1166 :
1167 : template<typename CharT, class AnyCharsAccess>
1168 : class GeneralTokenStreamChars
1169 : : public TokenStreamCharsBase<CharT>
1170 : {
1171 : using CharsSharedBase = TokenStreamCharsBase<CharT>;
1172 :
1173 : Token* newTokenInternal(TokenKind kind, TokenStart start, TokenKind* out);
1174 :
1175 : /**
1176 : * Allocates a new Token from the given offset to the current offset,
1177 : * ascribes it the given kind, and sets |*out| to that kind.
1178 600447 : */
1179 : Token* newToken(TokenKind kind, TokenStart start, TokenStreamShared::Modifier modifier,
1180 : TokenKind* out)
1181 8503 : {
1182 : Token* token = newTokenInternal(kind, start, out);
1183 :
1184 8503 : #ifdef DEBUG
1185 8503 : // Save the modifier used to get this token, so that if an ungetToken()
1186 8503 : // occurs and then the token is re-gotten (or peeked, etc.), we can
1187 : // assert both gets used compatible modifiers.
1188 32081 : token->modifier = modifier;
1189 : token->modifierException = TokenStreamShared::NoException;
1190 : #endif
1191 32081 :
1192 : return token;
1193 : }
1194 :
1195 32081 : uint32_t matchUnicodeEscape(uint32_t* codePoint);
1196 32081 : uint32_t matchExtendedUnicodeEscape(uint32_t* codePoint);
1197 32081 :
1198 : protected:
1199 242688 : using typename CharsSharedBase::SourceUnits;
1200 :
1201 : using CharsSharedBase::sourceUnits;
1202 242692 :
1203 242692 : public:
1204 242692 : using CharsSharedBase::CharsSharedBase;
1205 :
1206 296 : TokenStreamAnyChars& anyCharsAccess() {
1207 : return AnyCharsAccess::anyChars(this);
1208 0 : }
1209 296 :
1210 296 : const TokenStreamAnyChars& anyCharsAccess() const {
1211 : return AnyCharsAccess::anyChars(this);
1212 : }
1213 :
1214 : using TokenStreamSpecific = frontend::TokenStreamSpecific<CharT, AnyCharsAccess>;
1215 :
1216 16094 : TokenStreamSpecific* asSpecific() {
1217 16094 : static_assert(mozilla::IsBaseOf<GeneralTokenStreamChars, TokenStreamSpecific>::value,
1218 : "static_cast below presumes an inheritance relationship");
1219 16094 :
1220 16094 : return static_cast<TokenStreamSpecific*>(this);
1221 : }
1222 :
1223 : void newSimpleToken(TokenKind kind, TokenStart start, TokenStreamShared::Modifier modifier,
1224 : TokenKind* out)
1225 : {
1226 : newToken(kind, start, modifier, out);
1227 : }
1228 :
1229 : void newNumberToken(double dval, DecimalPoint decimalPoint, TokenStart start,
1230 212173 : TokenStreamShared::Modifier modifier, TokenKind* out)
1231 0 : {
1232 : Token* token = newToken(TokenKind::Number, start, modifier, out);
1233 : token->setNumber(dval, decimalPoint);
1234 : }
1235 :
1236 : void newAtomToken(TokenKind kind, JSAtom* atom, TokenStart start,
1237 : TokenStreamShared::Modifier modifier, TokenKind* out)
1238 3056 : {
1239 : MOZ_ASSERT(kind == TokenKind::String ||
1240 : kind == TokenKind::TemplateHead ||
1241 : kind == TokenKind::NoSubsTemplate);
1242 :
1243 : Token* token = newToken(kind, start, modifier, out);
1244 : token->setAtom(atom);
1245 : }
1246 :
1247 : void newNameToken(PropertyName* name, TokenStart start, TokenStreamShared::Modifier modifier,
1248 : TokenKind* out)
1249 : {
1250 : Token* token = newToken(TokenKind::Name, start, modifier, out);
1251 : token->setName(name);
1252 : }
1253 :
1254 : void newRegExpToken(RegExpFlag reflags, TokenStart start, TokenKind* out)
1255 : {
1256 : Token* token = newToken(TokenKind::RegExp, start, TokenStreamShared::Operand, out);
1257 : token->setRegExpFlags(reflags);
1258 : }
1259 :
1260 : MOZ_COLD bool badToken();
1261 :
1262 3056 : /**
1263 : * Get the next code unit -- the next numeric sub-unit of source text,
1264 : * possibly smaller than a full code point -- without updating line/column
1265 : * counters or consuming LineTerminatorSequences.
1266 : *
1267 : * Because of these limitations, only use this if (a) the resulting code
1268 : * unit is guaranteed to be ungotten (by ungetCodeUnit()) if it's an EOL,
1269 : * and (b) the line-related state (lineno, linebase) is not used before
1270 : * it's ungotten.
1271 : */
1272 : int32_t getCodeUnit() {
1273 : if (MOZ_LIKELY(!sourceUnits.atEnd()))
1274 : return sourceUnits.getCodeUnit();
1275 :
1276 : anyCharsAccess().flags.isEOF = true;
1277 : return EOF;
1278 : }
1279 :
1280 : void ungetCodeUnit(int32_t c) {
1281 0 : MOZ_ASSERT_IF(c == EOF, anyCharsAccess().flags.isEOF);
1282 5266800 :
1283 2633400 : CharsSharedBase::ungetCodeUnit(c);
1284 : }
1285 0 :
1286 0 : void ungetChar(int32_t c);
1287 :
1288 : /**
1289 : * Consume characters til EOL/EOF following the start of a single-line
1290 : * comment, without consuming the EOL/EOF.
1291 : */
1292 : void consumeRestOfSingleLineComment();
1293 :
1294 : MOZ_MUST_USE MOZ_ALWAYS_INLINE bool updateLineInfoForEOL() {
1295 : return anyCharsAccess().internalUpdateLineInfoForEOL(sourceUnits.offset());
1296 : }
1297 0 :
1298 2694 : protected:
1299 : uint32_t matchUnicodeEscapeIdStart(uint32_t* codePoint);
1300 : bool matchUnicodeEscapeIdent(uint32_t* codePoint);
1301 : };
1302 :
1303 : template<typename CharT, class AnyCharsAccess> class TokenStreamChars;
1304 :
1305 : template<class AnyCharsAccess>
1306 : class TokenStreamChars<char16_t, AnyCharsAccess>
1307 : : public GeneralTokenStreamChars<char16_t, AnyCharsAccess>
1308 : {
1309 : private:
1310 : using Self = TokenStreamChars<char16_t, AnyCharsAccess>;
1311 1224565 : using GeneralCharsBase = GeneralTokenStreamChars<char16_t, AnyCharsAccess>;
1312 2449130 : using CharsSharedBase = TokenStreamCharsBase<char16_t>;
1313 :
1314 1224565 : using GeneralCharsBase::asSpecific;
1315 :
1316 : using typename GeneralCharsBase::TokenStreamSpecific;
1317 1224577 :
1318 0 : protected:
1319 0 : using GeneralCharsBase::anyCharsAccess;
1320 1224577 : using GeneralCharsBase::getCodeUnit;
1321 1195995 : using CharsSharedBase::isAsciiCodePoint;
1322 1195995 : using GeneralCharsBase::sourceUnits;
1323 : using CharsSharedBase::ungetCodeUnit;
1324 : using GeneralCharsBase::updateLineInfoForEOL;
1325 28582 :
1326 28582 : using typename GeneralCharsBase::SourceUnits;
1327 28585 :
1328 : using GeneralCharsBase::GeneralCharsBase;
1329 0 :
1330 : // Try to get the next code point, normalizing '\r', '\r\n', '\n', and the
1331 : // Unicode line/paragraph separators into '\n'. Also updates internal
1332 : // line-counter state. Return true on success and store the character in
1333 : // |*c|. Return false and leave |*c| undefined on failure.
1334 : MOZ_MUST_USE bool getCodePoint(int32_t* cp);
1335 :
1336 : // A deprecated alias for |getCodePoint|: most code using this is being
1337 : // replaced with different approaches.
1338 : MOZ_MUST_USE bool getChar(int32_t* cp) {
1339 : return getCodePoint(cp);
1340 : }
1341 :
1342 : /**
1343 : * Given a just-consumed ASCII code unit/point |lead|, consume a full code
1344 : * point or LineTerminatorSequence (normalizing it to '\n') and store it in
1345 : * |*codePoint|. Return true on success, otherwise return false and leave
1346 : * |*codePoint| undefined on failure.
1347 : *
1348 : * If a LineTerminatorSequence was consumed, also update line/column info.
1349 : *
1350 : * This may change the current |sourceUnits| offset.
1351 : */
1352 : MOZ_MUST_USE bool getFullAsciiCodePoint(char16_t lead, int32_t* codePoint) {
1353 : MOZ_ASSERT(isAsciiCodePoint(lead),
1354 : "non-ASCII code units must be handled separately");
1355 : MOZ_ASSERT(lead == sourceUnits.previousCodeUnit(),
1356 : "getFullAsciiCodePoint called incorrectly");
1357 :
1358 : if (MOZ_UNLIKELY(lead == '\r')) {
1359 : if (MOZ_LIKELY(!sourceUnits.atEnd()))
1360 : sourceUnits.matchCodeUnit('\n');
1361 : } else if (MOZ_LIKELY(lead != '\n')) {
1362 : *codePoint = lead;
1363 : return true;
1364 : }
1365 :
1366 : *codePoint = '\n';
1367 : bool ok = updateLineInfoForEOL();
1368 : if (!ok) {
1369 : #ifdef DEBUG
1370 : *codePoint = EOF; // sentinel value to hopefully cause errors
1371 : #endif
1372 : MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1373 : }
1374 : return ok;
1375 : }
1376 :
1377 : /**
1378 : * Given a just-consumed non-ASCII code unit (and maybe point) |lead|,
1379 : * consume a full code point or LineTerminatorSequence (normalizing it to
1380 : * '\n') and store it in |*codePoint|. Return true on success, otherwise
1381 : * return false and leave |*codePoint| undefined on failure.
1382 : *
1383 : * If a LineTerminatorSequence was consumed, also update line/column info.
1384 : *
1385 : * This may change the current |sourceUnits| offset.
1386 : */
1387 : MOZ_MUST_USE bool getNonAsciiCodePoint(char16_t lead, int32_t* cp);
1388 :
1389 : /**
1390 : * Unget a full code point (ASCII or not) without altering line/column
1391 : * state. If line/column state must be updated, this must happen manually.
1392 : * This method ungets a single code point, not a LineTerminatorSequence
1393 : * that is multiple code points. (Generally you shouldn't be in a state
1394 : * where you've just consumed "\r\n" and want to unget that full sequence.)
1395 : *
1396 : * This function ordinarily should be used to unget code points that have
1397 : * been consumed *without* line/column state having been updated.
1398 : */
1399 : void ungetCodePointIgnoreEOL(uint32_t codePoint);
1400 3056 :
1401 : /**
1402 : * Unget an originally non-ASCII, normalized code point, including undoing
1403 : * line/column updates that were performed for it. Don't use this if the
1404 : * code point was gotten *without* line/column state being updated!
1405 : */
1406 : void ungetNonAsciiNormalizedCodePoint(uint32_t codePoint) {
1407 : MOZ_ASSERT_IF(isAsciiCodePoint(codePoint),
1408 : codePoint == '\n');
1409 : MOZ_ASSERT(codePoint != unicode::LINE_SEPARATOR,
1410 : "should not be ungetting un-normalized code points");
1411 : MOZ_ASSERT(codePoint != unicode::PARA_SEPARATOR,
1412 : "should not be ungetting un-normalized code points");
1413 :
1414 : ungetCodePointIgnoreEOL(codePoint);
1415 : if (codePoint == '\n')
1416 : anyCharsAccess().undoInternalUpdateLineInfoForEOL();
1417 : }
1418 :
1419 : /**
1420 : * Unget a just-gotten LineTerminator sequence: '\r', '\n', '\r\n', or
1421 : * a Unicode line/paragraph separator, also undoing line/column information
1422 : * changes reflecting that LineTerminator.
1423 : */
1424 : void ungetLineTerminator();
1425 : };
1426 :
1427 : // TokenStream is the lexical scanner for JavaScript source text.
1428 : //
1429 : // It takes a buffer of CharT characters (currently only char16_t encoding
1430 : // UTF-16, but we're adding either UTF-8 or Latin-1 single-byte text soon) and
1431 : // linearly scans it into |Token|s.
1432 : //
1433 : // Internally the class uses a four element circular buffer |tokens| of
1434 : // |Token|s. As an index for |tokens|, the member |cursor_| points to the
1435 : // current token. Calls to getToken() increase |cursor_| by one and return the
1436 : // new current token. If a TokenStream was just created, the current token is
1437 : // uninitialized. It's therefore important that one of the first four member
1438 : // functions listed below is called first. The circular buffer lets us go back
1439 : // up to two tokens from the last scanned token. Internally, the relative
1440 : // number of backward steps that were taken (via ungetToken()) after the last
1441 : // token was scanned is stored in |lookahead|.
1442 : //
1443 : // The following table lists the situations in which it is safe to call each listed
1444 : // function. No checks are made by the functions in non-debug builds.
1445 : //
1446 : // Function Name | Precondition; changes to |lookahead|
1447 : // ------------------+---------------------------------------------------------
1448 : // getToken | none; if |lookahead > 0| then |lookahead--|
1449 : // peekToken | none; if |lookahead == 0| then |lookahead == 1|
1450 : // peekTokenSameLine | none; if |lookahead == 0| then |lookahead == 1|
1451 : // matchToken | none; if |lookahead > 0| and the match succeeds then
1452 : // | |lookahead--|
1453 : // consumeKnownToken | none; if |lookahead > 0| then |lookahead--|
1454 : // ungetToken | 0 <= |lookahead| <= |maxLookahead - 1|; |lookahead++|
1455 : //
1456 : // The behavior of the token scanning process (see getTokenInternal()) can be
1457 : // modified by calling one of the first four above listed member functions with
1458 : // an optional argument of type Modifier. However, the modifier will be
1459 : // ignored unless |lookahead == 0| holds. Due to constraints of the grammar,
1460 : // this turns out not to be a problem in practice. See the
1461 : // mozilla.dev.tech.js-engine.internals thread entitled 'Bug in the scanner?'
1462 : // for more details:
1463 : // https://groups.google.com/forum/?fromgroups=#!topic/mozilla.dev.tech.js-engine.internals/2JLH5jRcr7E).
1464 : //
1465 : // The method seek() allows rescanning from a previously visited location of
1466 0 : // the buffer, initially computed by constructing a Position local variable.
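//
// An illustrative (hypothetical) use of the lookahead machinery, not taken
// from the parser itself:
//
//     TokenKind tt;
//     if (!ts.peekToken(&tt))                      // scans ahead; lookahead becomes 1
//         return false;
//     if (tt == TokenKind::Comma)
//         ts.consumeKnownToken(TokenKind::Comma);  // lookahead drops back to 0
//
//     bool matched;
//     if (!ts.matchToken(&matched, TokenKind::Semi))   // getToken, ungetting on mismatch
//         return false;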
1467 0 : //
1468 : template<typename CharT, class AnyCharsAccess>
1469 : class MOZ_STACK_CLASS TokenStreamSpecific
1470 0 : : public TokenStreamChars<CharT, AnyCharsAccess>,
1471 0 : public TokenStreamShared,
1472 0 : public ErrorReporter
1473 : {
1474 : public:
1475 : using CharsBase = TokenStreamChars<CharT, AnyCharsAccess>;
1476 : using GeneralCharsBase = GeneralTokenStreamChars<CharT, AnyCharsAccess>;
1477 816 : using CharsSharedBase = TokenStreamCharsBase<CharT>;
1478 1632 :
1479 : using Position = TokenStreamPosition<CharT>;
1480 :
1481 : // Anything inherited through a base class whose type depends upon this
1482 408 : // class's template parameters can only be accessed through a dependent
1483 816 : // name: prefixed with |this|, by explicit qualification, and so on. (This
1484 408 : // is so that references to inherited fields are statically distinguishable
1485 : // from references to names outside of the class.) This is tedious and
1486 : // onerous.
1487 : //
1488 640249 : // As an alternative, we directly add every one of these functions to this
1489 1280498 : // class, using explicit qualification to address the dependent-name
1490 : // problem. |this| or other qualification is no longer necessary -- at
1491 144125 : // cost of this ever-changing laundry list of |using|s. So it goes.
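    //
    // For illustration only: inside a member function of this class, a member
    // inherited from a dependent base would otherwise need qualification, e.g.
    //
    //     this->sourceUnits.atEnd();   // dependent-name form, always valid
    //     sourceUnits.atEnd();         // valid only because of the |using| below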
1492 288250 : public:
1493 : using GeneralCharsBase::anyCharsAccess;
1494 184123 : using CharsSharedBase::getTokenbuf;
1495 368246 :
1496 : private:
1497 : using typename CharsSharedBase::CharBuffer;
1498 : using typename CharsSharedBase::SourceUnits;
1499 :
1500 0 : private:
1501 0 : using CharsSharedBase::appendCodePointToTokenbuf;
1502 0 : using CharsSharedBase::atomizeChars;
1503 : using GeneralCharsBase::badToken;
1504 26898 : using CharsSharedBase::consumeKnownCodeUnit;
1505 53796 : using GeneralCharsBase::consumeRestOfSingleLineComment;
1506 : using CharsSharedBase::copyTokenbufTo;
1507 : using CharsSharedBase::fillWithTemplateStringContents;
1508 : using CharsBase::getChar;
1509 : using CharsBase::getCodePoint;
1510 : using GeneralCharsBase::getCodeUnit;
1511 : using CharsBase::getFullAsciiCodePoint;
1512 : using CharsBase::getNonAsciiCodePoint;
1513 : using CharsSharedBase::isAsciiCodePoint;
1514 : using CharsSharedBase::matchCodeUnit;
1515 : using GeneralCharsBase::matchUnicodeEscapeIdent;
1516 : using GeneralCharsBase::matchUnicodeEscapeIdStart;
1517 : using GeneralCharsBase::newAtomToken;
1518 : using GeneralCharsBase::newNameToken;
1519 : using GeneralCharsBase::newNumberToken;
1520 : using GeneralCharsBase::newRegExpToken;
1521 : using GeneralCharsBase::newSimpleToken;
1522 : using CharsSharedBase::peekCodeUnit;
1523 : using CharsSharedBase::sourceUnits;
1524 : using CharsSharedBase::tokenbuf;
1525 : using GeneralCharsBase::ungetChar;
1526 : using CharsBase::ungetCodePointIgnoreEOL;
1527 : using CharsSharedBase::ungetCodeUnit;
1528 : using CharsBase::ungetNonAsciiNormalizedCodePoint;
1529 : using GeneralCharsBase::updateLineInfoForEOL;
1530 :
1531 : template<typename CharU> friend class TokenStreamPosition;
1532 :
1533 : public:
1534 : TokenStreamSpecific(JSContext* cx, const ReadOnlyCompileOptions& options,
1535 : const CharT* base, size_t length);
1536 :
1537 : // If there is an invalid escape in a template, report it and return false,
1538 : // otherwise return true.
1539 : bool checkForInvalidTemplateEscapeError() {
1540 : if (anyCharsAccess().invalidTemplateEscapeType == InvalidEscapeType::None)
1541 : return true;
1542 :
1543 13 : reportInvalidEscapeError(anyCharsAccess().invalidTemplateEscapeOffset,
1544 0 : anyCharsAccess().invalidTemplateEscapeType);
1545 : return false;
1546 20 : }
1547 :
1548 0 : // ErrorReporter API.
1549 :
1550 0 : const JS::ReadOnlyCompileOptions& options() const final {
1551 : return anyCharsAccess().options();
1552 0 : }
1553 :
1554 : void
1555 0 : lineAndColumnAt(size_t offset, uint32_t* line, uint32_t* column) const final {
1556 : anyCharsAccess().lineAndColumnAt(offset, line, column);
1557 : }
1558 0 :
1559 13 : void currentLineAndColumn(uint32_t* line, uint32_t* column) const final;
1560 :
1561 : bool isOnThisLine(size_t offset, uint32_t lineNum, bool *onThisLine) const final {
1562 39 : return anyCharsAccess().srcCoords.isOnThisLine(offset, lineNum, onThisLine);
1563 : }
1564 : uint32_t lineAt(size_t offset) const final {
1565 : return anyCharsAccess().srcCoords.lineNum(offset);
1566 : }
1567 : uint32_t columnAt(size_t offset) const final {
1568 : return anyCharsAccess().srcCoords.columnIndex(offset);
1569 : }
1570 0 :
1571 0 : bool hasTokenizationStarted() const final;
1572 :
1573 0 : void reportErrorNoOffsetVA(unsigned errorNumber, va_list args) final {
1574 : anyCharsAccess().reportErrorNoOffsetVA(errorNumber, args);
1575 : }
1576 0 :
1577 0 : const char* getFilename() const final {
1578 : return anyCharsAccess().getFilename();
1579 0 : }
1580 0 :
1581 : // TokenStream-specific error reporters.
1582 0 : void reportError(unsigned errorNumber, ...);
1583 0 :
1584 : // Report the given error at the current offset.
1585 0 : void error(unsigned errorNumber, ...);
1586 0 :
1587 : // Report the given error at the given offset.
1588 : void errorAt(uint32_t offset, unsigned errorNumber, ...);
1589 : void errorAtVA(uint32_t offset, unsigned errorNumber, va_list* args);
1590 :
1591 : // Warn at the current offset.
1592 : MOZ_MUST_USE bool warning(unsigned errorNumber, ...);
1593 :
1594 : private:
1595 : // Compute a line of context for an otherwise-filled-in |err| at the given
1596 : // offset in this token stream. (This function basically exists to make
1597 : // |computeErrorMetadata| more readable and shouldn't be called elsewhere.)
1598 : MOZ_MUST_USE bool computeLineOfContext(ErrorMetadata* err, uint32_t offset);
1599 :
1600 : public:
1601 : // Compute error metadata for an error at the given offset.
1602 : MOZ_MUST_USE bool computeErrorMetadata(ErrorMetadata* err, uint32_t offset);
1603 :
1604 : // General-purpose error reporters. You should avoid calling these
1605 : // directly, and instead use the more succinct alternatives (error(),
1606 : // warning(), &c.) in TokenStream, Parser, and BytecodeEmitter.
1607 : //
1608 : // These functions take a |va_list*| parameter, not a |va_list| parameter,
1609 : // to hack around bug 1363116. (Longer-term, the right fix is of course to
1610 : // not use ellipsis functions or |va_list| at all in error reporting.)
1611 : bool reportStrictModeErrorNumberVA(UniquePtr<JSErrorNotes> notes, uint32_t offset,
1612 : bool strictMode, unsigned errorNumber, va_list* args);
1613 : bool reportExtraWarningErrorNumberVA(UniquePtr<JSErrorNotes> notes, uint32_t offset,
1614 : unsigned errorNumber, va_list* args);
1615 :
1616 : JSAtom* getRawTemplateStringAtom() {
1617 : TokenStreamAnyChars& anyChars = anyCharsAccess();
1618 :
1619 : MOZ_ASSERT(anyChars.currentToken().type == TokenKind::TemplateHead ||
1620 : anyChars.currentToken().type == TokenKind::NoSubsTemplate);
1621 : const CharT* cur = sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.begin + 1);
1622 : const CharT* end;
1623 : if (anyChars.currentToken().type == TokenKind::TemplateHead) {
1624 : // Of the form |`...${| or |}...${|
1625 : end = sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.end - 2);
1626 : } else {
1627 : // NO_SUBS_TEMPLATE is of the form |`...`| or |}...`|
1628 : end = sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.end - 1);
1629 : }
1630 :
1631 : CharBuffer charbuf(anyChars.cx);
1632 : if (!fillWithTemplateStringContents(charbuf, cur, end))
1633 : return nullptr;
1634 :
1635 : return atomizeChars(anyChars.cx, charbuf.begin(), charbuf.length());
1636 : }
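    // Worked example (illustrative): for the template head |`ab${| with
    // |pos.begin == 0| and |pos.end == 5|, |cur| points at 'a' (begin + 1) and
    // |end| points just past 'b' (end - 2), so the raw characters handed to
    // fillWithTemplateStringContents() are exactly "ab".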
1637 :
1638 : private:
1639 0 : // This is private because it should only be called by the tokenizer while
1640 : // tokenizing not by, for example, BytecodeEmitter.
1641 0 : bool reportStrictModeError(unsigned errorNumber, ...);
1642 0 :
1643 1227720 : void reportInvalidEscapeError(uint32_t offset, InvalidEscapeType type) {
1644 0 : switch (type) {
1645 1227720 : case InvalidEscapeType::None:
1646 0 : MOZ_ASSERT_UNREACHABLE("unexpected InvalidEscapeType");
1647 1227720 : return;
1648 1227720 : case InvalidEscapeType::Hexadecimal:
1649 0 : errorAt(offset, JSMSG_MALFORMED_ESCAPE, "hexadecimal");
1650 0 : return;
1651 : case InvalidEscapeType::Unicode:
1652 : errorAt(offset, JSMSG_MALFORMED_ESCAPE, "Unicode");
1653 557514 : return;
1654 : case InvalidEscapeType::UnicodeOverflow:
1655 : errorAt(offset, JSMSG_UNICODE_OVERFLOW, "escape sequence");
1656 355388 : return;
1657 710776 : case InvalidEscapeType::Octal:
1658 355388 : errorAt(offset, JSMSG_DEPRECATED_OCTAL);
1659 53497 : return;
1660 53497 : }
1661 53497 : }
1662 53497 :
1663 : MOZ_MUST_USE bool putIdentInTokenbuf(const CharT* identStart);
1664 301891 :
1665 : /**
1666 0 : * Tokenize a decimal number that begins at |numStart| into the provided
1667 301892 : * token.
1668 : *
1669 : * |unit| must be one of these values:
1670 0 : *
1671 124510 : * 1. The first decimal digit in the integral part of a decimal number
1672 62255 : * not starting with '0' or '.', e.g. '1' for "17", '3' for "3.14", or
1673 : * '8' for "8.675309e6".
1674 24558 : *
1675 0 : * In this case, the next |getCodeUnit()| must return the code unit after
1676 24558 : * |unit| in the overall number.
1677 24558 : *
1678 : * 2. The '.' in a "."/"0."-prefixed decimal number or the 'e'/'E' in a
1679 37697 : * "0e"/"0E"-prefixed decimal number, e.g. ".17", "0.42", or "0.1e3".
1680 37697 : *
1681 : * In this case, the next |getCodeUnit()| must return the code unit
1682 62255 : * *after* the first decimal digit *after* the '.'. So the next code
1683 62255 : * unit would be '7' in ".17", '2' in "0.42", 'e' in "0.4e+8", or '/' in
1684 : * "0.5/2" (three separate tokens).
1685 : *
1686 0 : * 3. The code unit after the '0' where "0" is the entire number token.
1687 16463 : *
1688 16463 : * In this case, the next |getCodeUnit()| would return the code unit
1689 :      *    after |unit|, but this function will never perform such a call.
1690 16463 : *
1691 0 : * 4. (Non-strict mode code only) The first '8' or '9' in a "noctal"
1692 : * number that begins with a '0' but contains a non-octal digit in its
1693 : * integer part so is interpreted as decimal, e.g. '9' in "09.28" or
1694 :      *    '8' in "0386" or '9' in "09+7" (three separate tokens).
1695 : *
1696 : * In this case, the next |getCodeUnit()| returns the code unit after
1697 : * |unit|: '.', '6', or '+' in the examples above.
1698 : *
1699 : * This interface is super-hairy and horribly stateful. Unfortunately, its
1700 : * hair merely reflects the intricacy of ECMAScript numeric literal syntax.
1701 0 : * And incredibly, it *improves* on the goto-based horror that predated it.
1702 638718 : */
1703 0 : MOZ_MUST_USE bool decimalNumber(int32_t unit, TokenStart start, const CharT* numStart,
1704 : Modifier modifier, TokenKind* out);
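    //
    // Worked examples for |unit| (illustrative only):
    //
    //     "3.14"  -> case 1: |unit| is '3'; the next getCodeUnit() returns '.'
    //     ".17"   -> case 2: |unit| is '.'; the next getCodeUnit() returns '7'
    //     "0;"    -> case 3: |unit| is ';'; no further getCodeUnit() call occurs
    //     "0386"  -> case 4: |unit| is '8'; the next getCodeUnit() returns '6'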
1705 :
1706 : /** Tokenize a regular expression literal beginning at |start|. */
1707 : MOZ_MUST_USE bool regexpLiteral(TokenStart start, TokenKind* out);
1708 :
1709 : public:
1710 319359 : // Advance to the next token. If the token stream encountered an error,
1711 : // return false. Otherwise return true and store the token kind in |*ttp|.
1712 0 : MOZ_MUST_USE bool getToken(TokenKind* ttp, Modifier modifier = None) {
1713 0 : // Check for a pushed-back token resulting from mismatching lookahead.
1714 0 : TokenStreamAnyChars& anyChars = anyCharsAccess();
1715 : if (anyChars.lookahead != 0) {
1716 : MOZ_ASSERT(!anyChars.flags.hadError);
1717 305517 : anyChars.lookahead--;
1718 0 : anyChars.advanceCursor();
1719 0 : TokenKind tt = anyChars.currentToken().type;
1720 303184 : MOZ_ASSERT(tt != TokenKind::Eol);
1721 303183 : verifyConsistentModifier(modifier, anyChars.currentToken());
1722 : *ttp = tt;
1723 : return true;
1724 : }
1725 :
1726 : return getTokenInternal(ttp, modifier);
1727 : }
1728 :
1729 : MOZ_MUST_USE bool peekToken(TokenKind* ttp, Modifier modifier = None) {
1730 : TokenStreamAnyChars& anyChars = anyCharsAccess();
1731 : if (anyChars.lookahead > 0) {
1732 : MOZ_ASSERT(!anyChars.flags.hadError);
1733 16181 : verifyConsistentModifier(modifier, anyChars.nextToken());
1734 : *ttp = anyChars.nextToken().type;
1735 16189 : return true;
1736 16189 : }
1737 : if (!getTokenInternal(ttp, modifier))
1738 16189 : return false;
1739 48567 : anyChars.ungetToken();
1740 : return true;
1741 : }
1742 16189 :
1743 : MOZ_MUST_USE bool peekTokenPos(TokenPos* posp, Modifier modifier = None) {
1744 : TokenStreamAnyChars& anyChars = anyCharsAccess();
1745 : if (anyChars.lookahead == 0) {
1746 0 : TokenKind tt;
1747 : if (!getTokenInternal(&tt, modifier))
1748 0 : return false;
1749 : anyChars.ungetToken();
1750 0 : MOZ_ASSERT(anyChars.hasLookahead());
1751 0 : } else {
1752 : MOZ_ASSERT(!anyChars.flags.hadError);
1753 0 : verifyConsistentModifier(modifier, anyChars.nextToken());
1754 0 : }
1755 : *posp = anyChars.nextToken().pos;
1756 : return true;
1757 : }
1758 :
1759 4707 : MOZ_MUST_USE bool peekOffset(uint32_t* offset, Modifier modifier = None) {
1760 : TokenPos pos;
1761 9414 : if (!peekTokenPos(&pos, modifier))
1762 4707 : return false;
1763 4707 : *offset = pos.begin;
1764 4707 : return true;
1765 : }
1766 0 :
1767 : // This is like peekToken(), with one exception: if there is an EOL
1768 124256 : // between the end of the current token and the start of the next token, it
1769 :     // returns true and stores Eol in |*ttp|.  In that case, no token with
1770 :     // Eol is actually created, just an Eol TokenKind is returned, and
1771 248512 : // currentToken() shouldn't be consulted. (This is the only place Eol
1772 124256 : // is produced.)
1773 : MOZ_ALWAYS_INLINE MOZ_MUST_USE bool
1774 : peekTokenSameLine(TokenKind* ttp, Modifier modifier = None) {
1775 : TokenStreamAnyChars& anyChars = anyCharsAccess();
1776 : const Token& curr = anyChars.currentToken();
1777 0 :
1778 : // If lookahead != 0, we have scanned ahead at least one token, and
1779 : // |lineno| is the line that the furthest-scanned token ends on. If
1780 : // it's the same as the line that the current token ends on, that's a
1781 : // stronger condition than what we are looking for, and we don't need
1782 : // to return Eol.
1783 : if (anyChars.lookahead != 0) {
1784 : bool onThisLine;
1785 : if (!anyChars.srcCoords.isOnThisLine(curr.pos.end, anyChars.lineno, &onThisLine)) {
1786 : reportError(JSMSG_OUT_OF_MEMORY);
1787 0 : return false;
1788 0 : }
1789 :
1790 : if (onThisLine) {
1791 0 : MOZ_ASSERT(!anyChars.flags.hadError);
1792 0 : verifyConsistentModifier(modifier, anyChars.nextToken());
1793 : *ttp = anyChars.nextToken().type;
1794 : return true;
1795 : }
1796 : }
1797 :
1798 : // The above check misses two cases where we don't have to return
1799 : // Eol.
1800 : // - The next token starts on the same line, but is a multi-line token.
1801 : // - The next token starts on the same line, but lookahead==2 and there
1802 : // is a newline between the next token and the one after that.
1803 : // The following test is somewhat expensive but gets these cases (and
1804 : // all others) right.
1805 : TokenKind tmp;
1806 : if (!getToken(&tmp, modifier))
1807 : return false;
1808 : const Token& next = anyChars.currentToken();
1809 : anyChars.ungetToken();
1810 :
1811 : const auto& srcCoords = anyChars.srcCoords;
1812 : *ttp = srcCoords.lineNum(curr.pos.end) == srcCoords.lineNum(next.pos.begin)
1813 : ? next.type
1814 : : TokenKind::Eol;
1815 : return true;
1816 : }
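    // Illustrative use by a hypothetical caller |ts|, e.g. when deciding
    // whether a restricted production continues on the current line:
    //
    //     TokenKind next;
    //     if (!ts.peekTokenSameLine(&next))
    //         return false;
    //     if (next == TokenKind::Eol)
    //         ...   // a line break intervenes; treat the production as ended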
1817 :
1818 : // Get the next token from the stream if its kind is |tt|.
1819 : MOZ_MUST_USE bool matchToken(bool* matchedp, TokenKind tt, Modifier modifier = None) {
1820 : TokenKind token;
1821 : if (!getToken(&token, modifier))
1822 : return false;
1823 : if (token == tt) {
1824 : *matchedp = true;
1825 : } else {
1826 : anyCharsAccess().ungetToken();
1827 : *matchedp = false;
1828 : }
1829 : return true;
1830 : }
1831 :
1832 : void consumeKnownToken(TokenKind tt, Modifier modifier = None) {
1833 : bool matched;
1834 : MOZ_ASSERT(anyCharsAccess().hasLookahead());
1835 : MOZ_ALWAYS_TRUE(matchToken(&matched, tt, modifier));
1836 : MOZ_ALWAYS_TRUE(matched);
1837 : }
1838 :
1839 : MOZ_MUST_USE bool nextTokenEndsExpr(bool* endsExpr) {
1840 : TokenKind tt;
1841 : if (!peekToken(&tt))
1842 : return false;
1843 :
1844 : *endsExpr = anyCharsAccess().isExprEnding[size_t(tt)];
1845 : if (*endsExpr) {
1846 : // If the next token ends an overall Expression, we'll parse this
1847 : // Expression without ever invoking Parser::orExpr(). But we need
1848 : // that function's side effect of adding this modifier exception,
1849 : // so we have to do it manually here.
1850 : anyCharsAccess().addModifierException(OperandIsNone);
1851 : }
1852 : return true;
1853 : }
1854 :
1855 : MOZ_MUST_USE bool advance(size_t position);
1856 :
1857 : void seek(const Position& pos);
1858 : MOZ_MUST_USE bool seek(const Position& pos, const TokenStreamAnyChars& other);
1859 :
1860 : const CharT* codeUnitPtrAt(size_t offset) const {
1861 : return sourceUnits.codeUnitPtrAt(offset);
1862 : }
1863 :
1864 : const CharT* rawLimit() const {
1865 : return sourceUnits.limit();
1866 : }
1867 :
1868 : MOZ_MUST_USE bool identifierName(TokenStart start, const CharT* identStart,
1869 : IdentifierEscapes escaping, Modifier modifier,
1870 : TokenKind* out);
1871 :
1872 : MOZ_MUST_USE bool getTokenInternal(TokenKind* const ttp, const Modifier modifier);
1873 :
1874 : MOZ_MUST_USE bool getStringOrTemplateToken(char untilChar, Modifier modifier, TokenKind* out);
1875 :
1876 : MOZ_MUST_USE bool getDirectives(bool isMultiline, bool shouldWarnDeprecated);
1877 : MOZ_MUST_USE bool getDirective(bool isMultiline, bool shouldWarnDeprecated,
1878 : const char* directive, uint8_t directiveLength,
1879 : const char* errorMsgPragma,
1880 : UniquePtr<char16_t[], JS::FreePolicy>* destination);
1881 : MOZ_MUST_USE bool getDisplayURL(bool isMultiline, bool shouldWarnDeprecated);
1882 : MOZ_MUST_USE bool getSourceMappingURL(bool isMultiline, bool shouldWarnDeprecated);
1883 : };
1884 :
1885 : // It's preferable to define this in TokenStream.cpp, but its template-ness
1886 : // means we'd then have to *instantiate* this constructor for all possible
1887 : // (CharT, AnyCharsAccess) pairs -- and that gets super-messy as AnyCharsAccess
1888 : // *itself* is templated. This symbol really isn't that huge compared to some
1889 : // defined inline in TokenStreamSpecific, so just rely on the linker commoning
1890 : // stuff up.
1891 : template<typename CharT>
1892 : template<class AnyCharsAccess>
1893 : inline
1894 : TokenStreamPosition<CharT>::TokenStreamPosition(AutoKeepAtoms& keepAtoms,
1895 : TokenStreamSpecific<CharT, AnyCharsAccess>& tokenStream)
1896 : {
1897 : TokenStreamAnyChars& anyChars = tokenStream.anyCharsAccess();
1898 :
1899 : buf = tokenStream.sourceUnits.addressOfNextCodeUnit(/* allowPoisoned = */ true);
1900 : flags = anyChars.flags;
1901 : lineno = anyChars.lineno;
1902 : linebase = anyChars.linebase;
1903 : prevLinebase = anyChars.prevLinebase;
1904 : lookahead = anyChars.lookahead;
1905 : currentToken = anyChars.currentToken();
1906 : for (unsigned i = 0; i < anyChars.lookahead; i++)
1907 : lookaheadTokens[i] = anyChars.tokens[anyChars.aheadCursor(1 + i)];
1908 : }
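// An illustrative save/rewind sketch (hypothetical caller with a TokenStream
// |ts| and an AutoKeepAtoms |keepAtoms| in scope):
//
//     TokenStream::Position pos(keepAtoms, ts);
//     // ... speculatively get/peek some tokens ...
//     ts.seek(pos);   // rewind the stream to the remembered state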
1909 :
1910 : class TokenStreamAnyCharsAccess
1911 : {
1912 : public:
1913 : template<class TokenStreamSpecific>
1914 : static inline TokenStreamAnyChars& anyChars(TokenStreamSpecific* tss);
1915 :
1916 : template<class TokenStreamSpecific>
1917 : static inline const TokenStreamAnyChars& anyChars(const TokenStreamSpecific* tss);
1918 : };
1919 :
1920 : class MOZ_STACK_CLASS TokenStream final
1921 : : public TokenStreamAnyChars,
1922 : public TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess>
1923 : {
1924 : using CharT = char16_t;
1925 :
1926 : public:
1927 : TokenStream(JSContext* cx, const ReadOnlyCompileOptions& options,
1928 : const CharT* base, size_t length, StrictModeGetter* smg)
1929 : : TokenStreamAnyChars(cx, options, smg),
1930 : TokenStreamSpecific<CharT, TokenStreamAnyCharsAccess>(cx, options, base, length)
1931 : {}
1932 : };
1933 :
1934 : template<class TokenStreamSpecific>
1935 : /* static */ inline TokenStreamAnyChars&
1936 : TokenStreamAnyCharsAccess::anyChars(TokenStreamSpecific* tss)
1937 : {
1938 : auto* ts = static_cast<TokenStream*>(tss);
1939 : return *static_cast<TokenStreamAnyChars*>(ts);
1940 : }
1941 :
1942 : template<class TokenStreamSpecific>
1943 : /* static */ inline const TokenStreamAnyChars&
1944 : TokenStreamAnyCharsAccess::anyChars(const TokenStreamSpecific* tss)
1945 : {
1946 : const auto* ts = static_cast<const TokenStream*>(tss);
1947 : return *static_cast<const TokenStreamAnyChars*>(ts);
1948 : }
1949 :
1950 : extern const char*
1951 : TokenKindToDesc(TokenKind tt);
1952 :
1953 : } // namespace frontend
1954 : } // namespace js
1955 :
1956 : extern JS_FRIEND_API(int)
1957 : js_fgets(char* buf, int size, FILE* file);
1958 :
1959 : #ifdef DEBUG
1960 : extern const char*
1961 : TokenKindToString(js::frontend::TokenKind tt);
1962 : #endif
1963 :
1964 : #endif /* frontend_TokenStream_h */