From 0c3ac0a30809d0aba6ce1765c3c44a23f56a0886 Mon Sep 17 00:00:00 2001 From: litetex <40789489+litetex@users.noreply.github.com> Date: Wed, 5 Mar 2025 16:07:05 +0100 Subject: [PATCH] Rework TokenStream --- ...Stream.java => EcmaScriptTokenStream.java} | 178 ++++-------------- .../extractor/utils/jsextractor/Lexer.java | 17 +- 2 files changed, 37 insertions(+), 158 deletions(-) rename extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/{TokenStream.java => EcmaScriptTokenStream.java} (87%) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/TokenStream.java b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/EcmaScriptTokenStream.java similarity index 87% rename from extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/TokenStream.java rename to extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/EcmaScriptTokenStream.java index 6533860a0..0cd5169dc 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/TokenStream.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/EcmaScriptTokenStream.java @@ -1,17 +1,28 @@ -package org.schabi.newpipe.extractor.utils.jsextractor; - -import org.mozilla.javascript.Context; -import org.mozilla.javascript.Kit; -import org.mozilla.javascript.ScriptRuntime; -import org.schabi.newpipe.extractor.exceptions.ParsingException; - -/* Source: Mozilla Rhino, org.mozilla.javascript.Token +/* + * Source: Mozilla Rhino, org.mozilla.javascript.TokenStream * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- * */ -class TokenStream { + * + */ +package org.schabi.newpipe.extractor.utils.jsextractor; + +import org.mozilla.javascript.Kit; +import org.mozilla.javascript.ScriptRuntime; +import org.schabi.newpipe.extractor.exceptions.ParsingException; + +/** + * Based on Mozilla Rhino's (v1.7.14) org.mozilla.javascript.TokenStream + *

+ * Changes: + *

+ */ +class EcmaScriptTokenStream { /* * For chars - because we need something out-of-range * to check. (And checking EOF by exception is annoying.) @@ -28,125 +39,17 @@ class TokenStream { private static final char BYTE_ORDER_MARK = '\uFEFF'; private static final char NUMERIC_SEPARATOR = '_'; - TokenStream(final String sourceString, final int lineno, final int languageVersion) { + EcmaScriptTokenStream(final String sourceString, final int lineno, final boolean strictMode) { this.sourceString = sourceString; this.sourceCursor = 0; this.cursor = 0; this.lineno = lineno; - this.languageVersion = languageVersion; + this.strictMode = strictMode; } - private static Token stringToKeyword( - final String name, - final int version, - final boolean isStrict) { - if (version < Context.VERSION_ES6) { - return stringToKeywordForJS(name); - } - return stringToKeywordForES(name, isStrict); - } - - /** JavaScript 1.8 and earlier */ - private static Token stringToKeywordForJS(final String name) { - switch (name) { - case "break": - return Token.BREAK; - case "case": - return Token.CASE; - case "continue": - return Token.CONTINUE; - case "default": - return Token.DEFAULT; - case "delete": - return Token.DELPROP; - case "do": - return Token.DO; - case "else": - return Token.ELSE; - case "export": - return Token.EXPORT; - case "false": - return Token.FALSE; - case "for": - return Token.FOR; - case "function": - return Token.FUNCTION; - case "if": - return Token.IF; - case "in": - return Token.IN; - case "let": - return Token.LET; - case "new": - return Token.NEW; - case "null": - return Token.NULL; - case "return": - return Token.RETURN; - case "switch": - return Token.SWITCH; - case "this": - return Token.THIS; - case "true": - return Token.TRUE; - case "typeof": - return Token.TYPEOF; - case "var": - return Token.VAR; - case "void": - return Token.VOID; - case "while": - return Token.WHILE; - case "with": - return Token.WITH; - case "yield": - return Token.YIELD; - case "throw": - 
return Token.THROW; - case "catch": - return Token.CATCH; - case "const": - return Token.CONST; - case "debugger": - return Token.DEBUGGER; - case "finally": - return Token.FINALLY; - case "instanceof": - return Token.INSTANCEOF; - case "try": - return Token.TRY; - case "abstract": - case "boolean": - case "byte": - case "char": - case "class": - case "double": - case "enum": - case "extends": - case "final": - case "float": - case "goto": - case "implements": - case "import": - case "int": - case "interface": - case "long": - case "native": - case "package": - case "private": - case "protected": - case "public": - case "short": - case "static": - case "super": - case "synchronized": - case "throws": - case "transient": - case "volatile": - return Token.RESERVED; - } - return Token.EOF; + private Token stringToKeyword(final String name) { + return stringToKeywordForES(name, strictMode); } /** ECMAScript 6. */ @@ -346,19 +249,9 @@ class TokenStream { // check if it's a keyword. // Return the corresponding token if it's a keyword - Token result = stringToKeyword(str, languageVersion, STRICT_MODE); + final Token result = stringToKeyword(str); if (result != Token.EOF) { - if ((result == Token.LET || result == Token.YIELD) - && languageVersion < Context.VERSION_1_7) { - result = Token.NAME; - } - // Save the string in case we need to use in - // object literal definitions. - if (result != Token.RESERVED - || languageVersion >= Context.VERSION_ES6 - || !IS_RESERVED_KEYWORD_AS_IDENTIFIER) { - return result; - } + return result; // ES6+ is always assumed, so keywords are never downgraded to NAME } } return Token.NAME; @@ -368,7 +261,6 @@ class TokenStream { if (isDigit(c) || (c == '.' 
&& isDigit(peekChar()))) { stringBufferTop = 0; int base = 10; - final boolean es6 = languageVersion >= Context.VERSION_ES6; boolean isOldOctal = false; if (c == '0') { @@ -376,10 +268,10 @@ class TokenStream { if (c == 'x' || c == 'X') { base = 16; c = getChar(); - } else if (es6 && (c == 'o' || c == 'O')) { + } else if (c == 'o' || c == 'O') { base = 8; c = getChar(); - } else if (es6 && (c == 'b' || c == 'B')) { + } else if (c == 'b' || c == 'B') { base = 2; c = getChar(); } else if (isDigit(c)) { @@ -422,7 +314,7 @@ class TokenStream { throw new ParsingException("number format error"); } - if (es6 && c == 'n') { + if (c == 'n') { c = getChar(); } else if (base == 10 && (c == '.' || c == 'e' || c == 'E')) { if (c == '.') { @@ -705,7 +597,7 @@ class TokenStream { return Token.GT; case '*': - if (languageVersion >= Context.VERSION_ES6 && matchChar('*')) { + if (matchChar('*')) { if (matchChar('=')) { return Token.ASSIGN_EXP; } @@ -1080,18 +972,16 @@ class TokenStream { // sourceCursor is an index into a small buffer that keeps a // sliding window of the source stream. - int sourceCursor; + private int sourceCursor; // cursor is a monotonically increasing index into the original // source stream, tracking exactly how far scanning has progressed. // Its value is the index of the next character to be scanned. - int cursor; + private int cursor; // Record start and end positions of last scanned token. 
int tokenBeg; int tokenEnd; - private final int languageVersion; - private static final boolean IS_RESERVED_KEYWORD_AS_IDENTIFIER = true; - private static final boolean STRICT_MODE = false; + private final boolean strictMode; } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Lexer.java b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Lexer.java index 2bbcef544..c900b0a2b 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Lexer.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Lexer.java @@ -1,6 +1,5 @@ package org.schabi.newpipe.extractor.utils.jsextractor; -import org.mozilla.javascript.Context; import org.schabi.newpipe.extractor.exceptions.ParsingException; import java.util.Stack; @@ -119,7 +118,7 @@ public class Lexer { } } - private final TokenStream stream; + private final EcmaScriptTokenStream stream; private final LookBehind lastThree; private final Stack braceStack; private final Stack parenStack; @@ -128,24 +127,14 @@ public class Lexer { * Create a new JavaScript lexer with the given source code * * @param js JavaScript code - * @param languageVersion JavaScript version (from Rhino) */ - public Lexer(final String js, final int languageVersion) { - stream = new TokenStream(js, 0, languageVersion); + public Lexer(final String js) { + stream = new EcmaScriptTokenStream(js, 0, false); lastThree = new LookBehind(); braceStack = new Stack<>(); parenStack = new Stack<>(); } - /** - * Create a new JavaScript lexer with the given source code - * - * @param js JavaScript code - */ - public Lexer(final String js) { - this(js, Context.VERSION_DEFAULT); - } - /** * Continue parsing and return the next token * @return next token