Rework TokenStream

This commit is contained in:
litetex 2025-03-05 16:07:05 +01:00
parent 77ee25e3b6
commit 0c3ac0a308
No known key found for this signature in database
GPG Key ID: 525B43E6039B3689
2 changed files with 37 additions and 158 deletions

View File

@ -1,17 +1,28 @@
package org.schabi.newpipe.extractor.utils.jsextractor; /*
* Source: Mozilla Rhino, org.mozilla.javascript.TokenStream
import org.mozilla.javascript.Context;
import org.mozilla.javascript.Kit;
import org.mozilla.javascript.ScriptRuntime;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
/* Source: Mozilla Rhino, org.mozilla.javascript.Token
* *
* This Source Code Form is subject to the terms of the Mozilla Public * This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this * License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* */ *
class TokenStream { */
package org.schabi.newpipe.extractor.utils.jsextractor;
import org.mozilla.javascript.Kit;
import org.mozilla.javascript.ScriptRuntime;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
/**
* Based on Mozilla Rhino's (v1.7.14) org.mozilla.javascript.TokenStream
* <p>
* Changes:
* <ul>
* <li>Tailored for {@link Lexer}</li>
* <li>Removed all unneeded code to improve performance</li>
* <li>Optimized for ECMAScript6/2015</li>
* </ul>
*/
class EcmaScriptTokenStream {
/* /*
* For chars - because we need something out-of-range * For chars - because we need something out-of-range
* to check. (And checking EOF by exception is annoying.) * to check. (And checking EOF by exception is annoying.)
@ -28,125 +39,17 @@ class TokenStream {
private static final char BYTE_ORDER_MARK = '\uFEFF'; private static final char BYTE_ORDER_MARK = '\uFEFF';
private static final char NUMERIC_SEPARATOR = '_'; private static final char NUMERIC_SEPARATOR = '_';
TokenStream(final String sourceString, final int lineno, final int languageVersion) { EcmaScriptTokenStream(final String sourceString, final int lineno, final boolean strictMode) {
this.sourceString = sourceString; this.sourceString = sourceString;
this.sourceCursor = 0; this.sourceCursor = 0;
this.cursor = 0; this.cursor = 0;
this.lineno = lineno; this.lineno = lineno;
this.languageVersion = languageVersion; this.strictMode = strictMode;
} }
private static Token stringToKeyword( private Token stringToKeyword(final String name) {
final String name, return stringToKeywordForES(name, strictMode);
final int version,
final boolean isStrict) {
if (version < Context.VERSION_ES6) {
return stringToKeywordForJS(name);
}
return stringToKeywordForES(name, isStrict);
}
/** JavaScript 1.8 and earlier */
private static Token stringToKeywordForJS(final String name) {
switch (name) {
case "break":
return Token.BREAK;
case "case":
return Token.CASE;
case "continue":
return Token.CONTINUE;
case "default":
return Token.DEFAULT;
case "delete":
return Token.DELPROP;
case "do":
return Token.DO;
case "else":
return Token.ELSE;
case "export":
return Token.EXPORT;
case "false":
return Token.FALSE;
case "for":
return Token.FOR;
case "function":
return Token.FUNCTION;
case "if":
return Token.IF;
case "in":
return Token.IN;
case "let":
return Token.LET;
case "new":
return Token.NEW;
case "null":
return Token.NULL;
case "return":
return Token.RETURN;
case "switch":
return Token.SWITCH;
case "this":
return Token.THIS;
case "true":
return Token.TRUE;
case "typeof":
return Token.TYPEOF;
case "var":
return Token.VAR;
case "void":
return Token.VOID;
case "while":
return Token.WHILE;
case "with":
return Token.WITH;
case "yield":
return Token.YIELD;
case "throw":
return Token.THROW;
case "catch":
return Token.CATCH;
case "const":
return Token.CONST;
case "debugger":
return Token.DEBUGGER;
case "finally":
return Token.FINALLY;
case "instanceof":
return Token.INSTANCEOF;
case "try":
return Token.TRY;
case "abstract":
case "boolean":
case "byte":
case "char":
case "class":
case "double":
case "enum":
case "extends":
case "final":
case "float":
case "goto":
case "implements":
case "import":
case "int":
case "interface":
case "long":
case "native":
case "package":
case "private":
case "protected":
case "public":
case "short":
case "static":
case "super":
case "synchronized":
case "throws":
case "transient":
case "volatile":
return Token.RESERVED;
}
return Token.EOF;
} }
/** ECMAScript 6. */ /** ECMAScript 6. */
@ -346,19 +249,9 @@ class TokenStream {
// check if it's a keyword. // check if it's a keyword.
// Return the corresponding token if it's a keyword // Return the corresponding token if it's a keyword
Token result = stringToKeyword(str, languageVersion, STRICT_MODE); final Token result = stringToKeyword(str);
if (result != Token.EOF) { if (result != Token.EOF) {
if ((result == Token.LET || result == Token.YIELD) return result; // Always needed due to ECMAScript
&& languageVersion < Context.VERSION_1_7) {
result = Token.NAME;
}
// Save the string in case we need to use in
// object literal definitions.
if (result != Token.RESERVED
|| languageVersion >= Context.VERSION_ES6
|| !IS_RESERVED_KEYWORD_AS_IDENTIFIER) {
return result;
}
} }
} }
return Token.NAME; return Token.NAME;
@ -368,7 +261,6 @@ class TokenStream {
if (isDigit(c) || (c == '.' && isDigit(peekChar()))) { if (isDigit(c) || (c == '.' && isDigit(peekChar()))) {
stringBufferTop = 0; stringBufferTop = 0;
int base = 10; int base = 10;
final boolean es6 = languageVersion >= Context.VERSION_ES6;
boolean isOldOctal = false; boolean isOldOctal = false;
if (c == '0') { if (c == '0') {
@ -376,10 +268,10 @@ class TokenStream {
if (c == 'x' || c == 'X') { if (c == 'x' || c == 'X') {
base = 16; base = 16;
c = getChar(); c = getChar();
} else if (es6 && (c == 'o' || c == 'O')) { } else if (c == 'o' || c == 'O') {
base = 8; base = 8;
c = getChar(); c = getChar();
} else if (es6 && (c == 'b' || c == 'B')) { } else if (c == 'b' || c == 'B') {
base = 2; base = 2;
c = getChar(); c = getChar();
} else if (isDigit(c)) { } else if (isDigit(c)) {
@ -422,7 +314,7 @@ class TokenStream {
throw new ParsingException("number format error"); throw new ParsingException("number format error");
} }
if (es6 && c == 'n') { if (c == 'n') {
c = getChar(); c = getChar();
} else if (base == 10 && (c == '.' || c == 'e' || c == 'E')) { } else if (base == 10 && (c == '.' || c == 'e' || c == 'E')) {
if (c == '.') { if (c == '.') {
@ -705,7 +597,7 @@ class TokenStream {
return Token.GT; return Token.GT;
case '*': case '*':
if (languageVersion >= Context.VERSION_ES6 && matchChar('*')) { if (matchChar('*')) {
if (matchChar('=')) { if (matchChar('=')) {
return Token.ASSIGN_EXP; return Token.ASSIGN_EXP;
} }
@ -1080,18 +972,16 @@ class TokenStream {
// sourceCursor is an index into a small buffer that keeps a // sourceCursor is an index into a small buffer that keeps a
// sliding window of the source stream. // sliding window of the source stream.
int sourceCursor; private int sourceCursor;
// cursor is a monotonically increasing index into the original // cursor is a monotonically increasing index into the original
// source stream, tracking exactly how far scanning has progressed. // source stream, tracking exactly how far scanning has progressed.
// Its value is the index of the next character to be scanned. // Its value is the index of the next character to be scanned.
int cursor; private int cursor;
// Record start and end positions of last scanned token. // Record start and end positions of last scanned token.
int tokenBeg; int tokenBeg;
int tokenEnd; int tokenEnd;
private final int languageVersion; private final boolean strictMode;
private static final boolean IS_RESERVED_KEYWORD_AS_IDENTIFIER = true;
private static final boolean STRICT_MODE = false;
} }

View File

@ -1,6 +1,5 @@
package org.schabi.newpipe.extractor.utils.jsextractor; package org.schabi.newpipe.extractor.utils.jsextractor;
import org.mozilla.javascript.Context;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import java.util.Stack; import java.util.Stack;
@ -119,7 +118,7 @@ public class Lexer {
} }
} }
private final TokenStream stream; private final EcmaScriptTokenStream stream;
private final LookBehind lastThree; private final LookBehind lastThree;
private final Stack<Brace> braceStack; private final Stack<Brace> braceStack;
private final Stack<Paren> parenStack; private final Stack<Paren> parenStack;
@ -128,24 +127,14 @@ public class Lexer {
* Create a new JavaScript lexer with the given source code * Create a new JavaScript lexer with the given source code
* *
* @param js JavaScript code * @param js JavaScript code
* @param languageVersion JavaScript version (from Rhino)
*/ */
public Lexer(final String js, final int languageVersion) { public Lexer(final String js) {
stream = new TokenStream(js, 0, languageVersion); stream = new EcmaScriptTokenStream(js, 0, false);
lastThree = new LookBehind(); lastThree = new LookBehind();
braceStack = new Stack<>(); braceStack = new Stack<>();
parenStack = new Stack<>(); parenStack = new Stack<>();
} }
/**
* Create a new JavaScript lexer with the given source code
*
* @param js JavaScript code
*/
public Lexer(final String js) {
this(js, Context.VERSION_DEFAULT);
}
/** /**
* Continue parsing and return the next token * Continue parsing and return the next token
* @return next token * @return next token