diff --git a/python/pom.xml b/python/pom.xml
index ce079ba36a..669998e1b3 100644
--- a/python/pom.xml
+++ b/python/pom.xml
@@ -16,6 +16,6 @@
python2
python3
python2_7_18
- python3_12_1
+ python3_12
diff --git a/python/python2_7_18/CSharp/PythonLexerBase.cs b/python/python2_7_18/CSharp/PythonLexerBase.cs
index 627313018d..7902984380 100644
--- a/python/python2_7_18/CSharp/PythonLexerBase.cs
+++ b/python/python2_7_18/CSharp/PythonLexerBase.cs
@@ -37,6 +37,7 @@ public abstract class PythonLexerBase : Lexer
private Stack indentLengthStack;
// A list where tokens are waiting to be loaded into the token stream
private LinkedList pendingTokens;
+
// last pending token types
private int previousPendingTokenType;
private int lastPendingTokenTypeFromDefaultChannel;
@@ -47,11 +48,11 @@ public abstract class PythonLexerBase : Lexer
private bool wasSpaceIndentation;
private bool wasTabIndentation;
private bool wasIndentationMixedWithSpacesAndTabs;
- private const int INVALID_LENGTH = -1;
- private CommonToken curToken; // current (under processing) token
- private IToken ffgToken; // following (look ahead) token
+ private IToken curToken; // current (under processing) token
+ private IToken ffgToken; // following (look ahead) token
+ private const int INVALID_LENGTH = -1;
private const string ERR_TXT = " ERROR: ";
protected PythonLexerBase(ICharStream input) : base(input)
@@ -64,6 +65,20 @@ protected PythonLexerBase(ICharStream input, TextWriter output, TextWriter error
this.Init();
}
+ public override IToken NextToken() // reading the input stream until a return EOF
+ {
+ this.CheckNextToken();
+ IToken firstPendingToken = this.pendingTokens.First.Value;
+ this.pendingTokens.RemoveFirst();
+ return firstPendingToken; // add the queued token to the token stream
+ }
+
+ public override void Reset()
+ {
+ this.Init();
+ base.Reset();
+ }
+
private void Init()
{
this.indentLengthStack = new Stack();
@@ -78,14 +93,6 @@ private void Init()
this.ffgToken = null!;
}
- public override IToken NextToken() // reading the input stream until a return EOF
- {
- this.CheckNextToken();
- IToken firstPendingToken = this.pendingTokens.First.Value;
- this.pendingTokens.RemoveFirst();
- return firstPendingToken; // add the queued token to the token stream
- }
-
private void CheckNextToken()
{
if (this.previousPendingTokenType != TokenConstants.EOF)
@@ -113,10 +120,7 @@ private void CheckNextToken()
case PythonLexer.NEWLINE:
this.HandleNEWLINEtoken();
break;
- case PythonLexer.STRING:
- this.HandleSTRINGtoken();
- break;
- case PythonLexer.ERROR_TOKEN:
+ case PythonLexer.ERRORTOKEN:
this.ReportLexerError("token recognition error at: '" + this.curToken.Text + "'");
this.AddPendingToken(this.curToken);
break;
@@ -133,12 +137,12 @@ private void CheckNextToken()
private void SetCurrentAndFollowingTokens()
{
this.curToken = this.ffgToken == null ?
- new CommonToken(base.NextToken()) :
- new CommonToken(this.ffgToken);
+ base.NextToken() :
+ this.ffgToken;
this.ffgToken = this.curToken.Type == TokenConstants.EOF ?
- this.curToken :
- base.NextToken();
+ this.curToken :
+ base.NextToken();
}
// initialize the _indentLengths
@@ -196,7 +200,7 @@ private void HandleNEWLINEtoken()
}
else
{
- CommonToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
+ IToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
bool isLookingAhead = this.ffgToken.Type == PythonLexer.WS;
if (isLookingAhead)
{
@@ -205,12 +209,12 @@ private void HandleNEWLINEtoken()
switch (this.ffgToken.Type)
{
- case PythonLexer.NEWLINE: // We're before a blank line
- case PythonLexer.COMMENT: // We're before a comment
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
this.HideAndAddPendingToken(nlToken);
if (isLookingAhead)
{
- this.AddPendingToken(this.curToken); // WS token
+ this.AddPendingToken(this.curToken); // WS token
}
break;
default:
@@ -243,7 +247,6 @@ private void HandleNEWLINEtoken()
private void InsertIndentOrDedentToken(int indentLength)
{
- //*** https://docs.python.org/3/reference/lexical_analysis.html#indentation
int prevIndentLength = this.indentLengthStack.Peek();
if (indentLength > prevIndentLength)
{
@@ -268,25 +271,6 @@ private void InsertIndentOrDedentToken(int indentLength)
}
}
- private void HandleSTRINGtoken()
- {
- // remove the \ escape sequences from the string literal
- // https://docs.python.org/3.11/reference/lexical_analysis.html#string-and-bytes-literals
- string line_joinFreeStringLiteral = Regex.Replace(this.curToken.Text, @"\\\r?\n", "");
- if (this.curToken.Text.Length == line_joinFreeStringLiteral.Length)
- {
- this.AddPendingToken(this.curToken);
- }
- else
- {
- CommonToken originalSTRINGtoken = new CommonToken(this.curToken); // backup the original token
- this.curToken.Text = line_joinFreeStringLiteral;
- this.AddPendingToken(this.curToken); // add the modified token with inline string literal
- this.HideAndAddPendingToken(originalSTRINGtoken); // add the original token with a hidden channel
- // this inserted hidden token allows to restore the original string literal with the \ escape sequences
- }
- }
-
private void InsertTrailingTokens()
{
switch (this.lastPendingTokenTypeFromDefaultChannel)
@@ -311,42 +295,43 @@ private void HandleEOFtoken()
this.AddPendingToken(this.curToken);
}
- private void HideAndAddPendingToken(CommonToken cToken)
+ private void HideAndAddPendingToken(IToken tkn)
{
- cToken.Channel = TokenConstants.HiddenChannel;
- this.AddPendingToken(cToken);
+ CommonToken ctkn = new CommonToken(tkn);
+ ctkn.Channel = TokenConstants.HiddenChannel;
+ this.AddPendingToken(ctkn);
}
- private void CreateAndAddPendingToken(int type, int channel, string text, IToken baseToken)
+ private void CreateAndAddPendingToken(int ttype, int channel, string text, IToken sampleToken)
{
- CommonToken cToken = new CommonToken(baseToken);
- cToken.Type = type;
- cToken.Channel = channel;
- cToken.StopIndex = baseToken.StartIndex - 1;
+ CommonToken ctkn = new CommonToken(sampleToken);
+ ctkn.Type = ttype;
+ ctkn.Channel = channel;
+ ctkn.StopIndex = sampleToken.StartIndex - 1;
- cToken.Text = text == null
- ? "<" + Vocabulary.GetSymbolicName(type) + ">"
+ ctkn.Text = text == null
+ ? "<" + Vocabulary.GetSymbolicName(ttype) + ">"
: text;
- this.AddPendingToken(cToken);
+ this.AddPendingToken(ctkn);
}
- private void AddPendingToken(IToken token)
+ private void AddPendingToken(IToken tkn)
{
// save the last pending token type because the pendingTokens linked list can be empty by the nextToken()
- this.previousPendingTokenType = token.Type;
- if (token.Channel == TokenConstants.DefaultChannel)
+ this.previousPendingTokenType = tkn.Type;
+ if (tkn.Channel == TokenConstants.DefaultChannel)
{
this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
}
- this.pendingTokens.AddLast(token);
+ this.pendingTokens.AddLast(tkn);
}
- private int GetIndentationLength(string textWS) // the textWS may contain spaces, tabs or form feeds
+ private int GetIndentationLength(string indentText) // the indentText may contain spaces, tabs or form feeds
{
const int TAB_LENGTH = 8; // the standard number of spaces to replace a tab with spaces
int length = 0;
- foreach (char ch in textWS)
+ foreach (char ch in indentText)
{
switch (ch)
{
@@ -369,7 +354,7 @@ private int GetIndentationLength(string textWS) // the textWS may contain spaces
if (!this.wasIndentationMixedWithSpacesAndTabs)
{
this.wasIndentationMixedWithSpacesAndTabs = true;
- return PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent
+ length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent
}
}
return length;
@@ -384,13 +369,7 @@ private void ReportError(string errMsg)
{
this.ReportLexerError(errMsg);
- // the ERROR_TOKEN will raise an error in the parser
- this.CreateAndAddPendingToken(PythonLexer.ERROR_TOKEN, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken);
- }
-
- public override void Reset()
- {
- this.Init();
- base.Reset();
+ // the ERRORTOKEN will raise an error in the parser
+ this.CreateAndAddPendingToken(PythonLexer.ERRORTOKEN, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken);
}
}
diff --git a/python/python2_7_18/Java/PythonLexerBase.java b/python/python2_7_18/Java/PythonLexerBase.java
index d09dc1171b..2abaa1769a 100644
--- a/python/python2_7_18/Java/PythonLexerBase.java
+++ b/python/python2_7_18/Java/PythonLexerBase.java
@@ -27,298 +27,285 @@ of this software and associated documentation files (the "Software"), to deal
*
*/
-import java.util.*;
-
-import org.antlr.v4.runtime.*;
-
-public abstract class PythonLexerBase extends Lexer {
- // A stack that keeps track of the indentation lengths
- private Deque indentLengthStack;
- // A list where tokens are waiting to be loaded into the token stream
- private LinkedList pendingTokens;
-
- // last pending token types
- private int previousPendingTokenType;
- private int lastPendingTokenTypeFromDefaultChannel;
-
- // The amount of opened parentheses, square brackets or curly braces
- private int opened;
-
- private boolean wasSpaceIndentation;
- private boolean wasTabIndentation;
- private boolean wasIndentationMixedWithSpacesAndTabs;
- private final int INVALID_LENGTH = -1;
-
- private CommonToken curToken; // current (under processing) token
- private Token ffgToken; // following (look ahead) token
-
- private final String ERR_TXT = " ERROR: ";
-
- protected PythonLexerBase(CharStream input) {
- super(input);
- this.init();
- }
-
- private void init() {
- this.indentLengthStack = new ArrayDeque<>();
- this.pendingTokens = new LinkedList<>();
- this.previousPendingTokenType = 0;
- this.lastPendingTokenTypeFromDefaultChannel = 0;
- this.opened = 0;
- this.wasSpaceIndentation = false;
- this.wasTabIndentation = false;
- this.wasIndentationMixedWithSpacesAndTabs = false;
- this.curToken = null;
- this.ffgToken = null;
- }
-
- @Override
- public Token nextToken() { // reading the input stream until a return EOF
- this.checkNextToken();
- return this.pendingTokens.pollFirst(); // add the queued token to the token stream
- }
-
- private void checkNextToken() {
- if (this.previousPendingTokenType != Token.EOF) {
- this.setCurrentAndFollowingTokens();
- if (this.indentLengthStack.isEmpty()) { // We're at the first token
- this.handleStartOfInput();
- }
-
- switch (this.curToken.getType()) {
- case PythonLexer.LPAR:
- case PythonLexer.LSQB:
- case PythonLexer.LBRACE:
- this.opened++;
- this.addPendingToken(this.curToken);
- break;
- case PythonLexer.RPAR:
- case PythonLexer.RSQB:
- case PythonLexer.RBRACE:
- this.opened--;
- this.addPendingToken(this.curToken);
- break;
- case PythonLexer.NEWLINE:
- this.handleNEWLINEtoken();
- break;
- case PythonLexer.STRING:
- this.handleSTRINGtoken();
- break;
- case PythonLexer.ERROR_TOKEN:
- this.reportLexerError("token recognition error at: '" + this.curToken.getText() + "'");
- this.addPendingToken(this.curToken);
- break;
- case Token.EOF:
- this.handleEOFtoken();
- break;
- default:
- this.addPendingToken(this.curToken);
- }
- }
- }
-
- private void setCurrentAndFollowingTokens() {
- this.curToken = this.ffgToken == null ?
- new CommonToken(super.nextToken()) :
- new CommonToken(this.ffgToken);
-
- this.ffgToken = this.curToken.getType() == Token.EOF ?
- this.curToken :
- super.nextToken();
- }
-
- // initialize the indentLengthStack
- // hide the leading NEWLINE token(s)
- // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
- // insert a leading INDENT token if necessary
- private void handleStartOfInput() {
- // initialize the stack with a default 0 indentation length
- this.indentLengthStack.push(0); // this will never be popped off
- while (this.curToken.getType() != Token.EOF) {
- if (this.curToken.getChannel() == Token.DEFAULT_CHANNEL) {
- if (this.curToken.getType() == PythonLexer.NEWLINE) {
- // all the NEWLINE tokens must be ignored before the first statement
- this.hideAndAddPendingToken(this.curToken);
- } else { // We're at the first statement
- this.insertLeadingIndentToken();
- return; // continue the processing of the current token with checkNextToken()
- }
- } else {
- this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
- }
- this.setCurrentAndFollowingTokens();
- } // continue the processing of the EOF token with checkNextToken()
- }
-
- private void insertLeadingIndentToken() {
- if (this.previousPendingTokenType == PythonLexer.WS) {
- Token prevToken = this.pendingTokens.peekLast(); // WS token
- if (this.getIndentationLength(prevToken.getText()) != 0) { // there is an "indentation" before the first statement
- final String errMsg = "first statement indented";
- this.reportLexerError(errMsg);
- // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
- this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken);
- }
- }
- }
-
- private void handleNEWLINEtoken() {
- if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
- this.hideAndAddPendingToken(this.curToken);
- } else {
- CommonToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
- final boolean isLookingAhead = this.ffgToken.getType() == PythonLexer.WS;
- if (isLookingAhead) {
- this.setCurrentAndFollowingTokens(); // set the next two tokens
- }
-
- switch (this.ffgToken.getType()) {
- case PythonLexer.NEWLINE: // We're before a blank line
- case PythonLexer.COMMENT: // We're before a comment
- this.hideAndAddPendingToken(nlToken);
- if (isLookingAhead) {
- this.addPendingToken(this.curToken); // WS token
- }
- break;
- default:
- this.addPendingToken(nlToken);
- if (isLookingAhead) { // We're on whitespace(s) followed by a statement
- final int indentationLength = this.ffgToken.getType() == Token.EOF ?
- 0 :
- this.getIndentationLength(this.curToken.getText());
-
- if (indentationLength != this.INVALID_LENGTH) {
- this.addPendingToken(this.curToken); // WS token
- this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
- } else {
- this.reportError("inconsistent use of tabs and spaces in indentation");
- }
- } else { // We're at a newline followed by a statement (there is no whitespace before the statement)
- this.insertIndentOrDedentToken(0); // may insert DEDENT token(s)
- }
- }
- }
- }
-
- private void insertIndentOrDedentToken(final int indentLength) {
- int prevIndentLength = this.indentLengthStack.peek();
- if (indentLength > prevIndentLength) {
- this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
- this.indentLengthStack.push(indentLength);
- } else {
- while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
- this.indentLengthStack.pop();
- prevIndentLength = this.indentLengthStack.peek();
- if (indentLength <= prevIndentLength) {
- this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
- } else {
- this.reportError("inconsistent dedent");
- }
- }
- }
- }
-
- private void handleSTRINGtoken() { // remove the \ escape sequences from the string literal
- final String line_joinFreeStringLiteral = this.curToken.getText().replaceAll("\\\\\\r?\\n", "");
- if (this.curToken.getText().length() == line_joinFreeStringLiteral.length()) {
- this.addPendingToken(this.curToken);
- } else {
- CommonToken originalSTRINGtoken = new CommonToken(this.curToken); // backup the original token
- this.curToken.setText(line_joinFreeStringLiteral);
- this.addPendingToken(this.curToken); // add the modified token with inline string literal
- this.hideAndAddPendingToken(originalSTRINGtoken); // add the original token to the hidden channel
- // this inserted hidden token allows to restore the original string literal with the \ escape sequences
- }
- }
-
- private void insertTrailingTokens() {
- switch (this.lastPendingTokenTypeFromDefaultChannel) {
- case PythonLexer.NEWLINE:
- case PythonLexer.DEDENT:
- break; // no trailing NEWLINE token is needed
- default:
- // insert an extra trailing NEWLINE token that serves as the end of the last statement
- this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // ffgToken is EOF
- }
- this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
- }
-
- private void handleEOFtoken() {
- if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
- // there was statement in the input (leading NEWLINE tokens are hidden)
- this.insertTrailingTokens();
- }
- this.addPendingToken(this.curToken);
- }
-
- private void hideAndAddPendingToken(CommonToken cToken) {
- cToken.setChannel(Token.HIDDEN_CHANNEL);
- this.addPendingToken(cToken);
- }
-
- private void createAndAddPendingToken(final int type, final int channel, final String text, Token baseToken) {
- CommonToken cToken = new CommonToken(baseToken);
- cToken.setType(type);
- cToken.setChannel(channel);
- cToken.setStopIndex(baseToken.getStartIndex() - 1);
- cToken.setText(text == null
- ? "<" + this.getVocabulary().getSymbolicName(type) + ">"
- : text);
-
- this.addPendingToken(cToken);
- }
-
- private void addPendingToken(final Token token) {
- // save the last pending token type because the pendingTokens linked list can be empty by the nextToken()
- this.previousPendingTokenType = token.getType();
- if (token.getChannel() == Token.DEFAULT_CHANNEL) {
- this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
- }
- this.pendingTokens.addLast(token);
- }
-
- private int getIndentationLength(final String textWS) { // the textWS may contain spaces, tabs or form feeds
- final int TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces
- int length = 0;
- for (char ch : textWS.toCharArray()) {
- switch (ch) {
- case ' ':
- this.wasSpaceIndentation = true;
- length += 1;
- break;
- case '\t':
- this.wasTabIndentation = true;
- length += TAB_LENGTH - (length % TAB_LENGTH);
- break;
- case '\f': // form feed
- length = 0;
- break;
- }
- }
-
- if (this.wasTabIndentation && this.wasSpaceIndentation) {
- if (!(this.wasIndentationMixedWithSpacesAndTabs)) {
- this.wasIndentationMixedWithSpacesAndTabs = true;
- return this.INVALID_LENGTH; // only for the first inconsistent indent
- }
- }
- return length;
- }
-
- private void reportLexerError(final String errMsg) {
- this.getErrorListenerDispatch().syntaxError(this, this.curToken, this.curToken.getLine(), this.curToken.getCharPositionInLine(), " LEXER" + this.ERR_TXT + errMsg, null);
- }
-
- private void reportError(final String errMsg) {
- this.reportLexerError(errMsg);
-
- // the ERROR_TOKEN will raise an error in the parser
- this.createAndAddPendingToken(PythonLexer.ERROR_TOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken);
- }
-
- @Override
- public void reset() {
- this.init();
- super.reset();
- }
-}
+ import java.util.*;
+
+ import org.antlr.v4.runtime.*;
+
+ public abstract class PythonLexerBase extends Lexer {
+ // A stack that keeps track of the indentation lengths
+ private Deque indentLengthStack;
+ // A list where tokens are waiting to be loaded into the token stream
+ private LinkedList pendingTokens;
+
+ // last pending token types
+ private int previousPendingTokenType;
+ private int lastPendingTokenTypeFromDefaultChannel;
+
+ // The amount of opened parentheses, square brackets or curly braces
+ private int opened;
+
+ private boolean wasSpaceIndentation;
+ private boolean wasTabIndentation;
+ private boolean wasIndentationMixedWithSpacesAndTabs;
+
+ private Token curToken; // current (under processing) token
+ private Token ffgToken; // following (look ahead) token
+
+ private final int INVALID_LENGTH = -1;
+ private final String ERR_TXT = " ERROR: ";
+
+ protected PythonLexerBase(CharStream input) {
+ super(input);
+ this.init();
+ }
+
+ @Override
+ public Token nextToken() { // reading the input stream until a return EOF
+ this.checkNextToken();
+ return this.pendingTokens.pollFirst(); // add the queued token to the token stream
+ }
+
+ @Override
+ public void reset() {
+ this.init();
+ super.reset();
+ }
+
+ private void init() {
+ this.indentLengthStack = new ArrayDeque<>();
+ this.pendingTokens = new LinkedList<>();
+ this.previousPendingTokenType = 0;
+ this.lastPendingTokenTypeFromDefaultChannel = 0;
+ this.opened = 0;
+ this.wasSpaceIndentation = false;
+ this.wasTabIndentation = false;
+ this.wasIndentationMixedWithSpacesAndTabs = false;
+ this.curToken = null;
+ this.ffgToken = null;
+ }
+
+ private void checkNextToken() {
+ if (this.previousPendingTokenType != Token.EOF) {
+ this.setCurrentAndFollowingTokens();
+ if (this.indentLengthStack.isEmpty()) { // We're at the first token
+ this.handleStartOfInput();
+ }
+
+ switch (this.curToken.getType()) {
+ case PythonLexer.LPAR:
+ case PythonLexer.LSQB:
+ case PythonLexer.LBRACE:
+ this.opened++;
+ this.addPendingToken(this.curToken);
+ break;
+ case PythonLexer.RPAR:
+ case PythonLexer.RSQB:
+ case PythonLexer.RBRACE:
+ this.opened--;
+ this.addPendingToken(this.curToken);
+ break;
+ case PythonLexer.NEWLINE:
+ this.handleNEWLINEtoken();
+ break;
+ case PythonLexer.ERRORTOKEN:
+ this.reportLexerError("token recognition error at: '" + this.curToken.getText() + "'");
+ this.addPendingToken(this.curToken);
+ break;
+ case Token.EOF:
+ this.handleEOFtoken();
+ break;
+ default:
+ this.addPendingToken(this.curToken);
+ }
+ }
+ }
+
+ private void setCurrentAndFollowingTokens() {
+ this.curToken = this.ffgToken == null ?
+ super.nextToken() :
+ this.ffgToken;
+
+ this.ffgToken = this.curToken.getType() == Token.EOF ?
+ this.curToken :
+ super.nextToken();
+ }
+
+ // initialize the indentLengthStack
+ // hide the leading NEWLINE token(s)
+ // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
+ // insert a leading INDENT token if necessary
+ private void handleStartOfInput() {
+ // initialize the stack with a default 0 indentation length
+ this.indentLengthStack.push(0); // this will never be popped off
+ while (this.curToken.getType() != Token.EOF) {
+ if (this.curToken.getChannel() == Token.DEFAULT_CHANNEL) {
+ if (this.curToken.getType() == PythonLexer.NEWLINE) {
+ // all the NEWLINE tokens must be ignored before the first statement
+ this.hideAndAddPendingToken(this.curToken);
+ } else { // We're at the first statement
+ this.insertLeadingIndentToken();
+ return; // continue the processing of the current token with checkNextToken()
+ }
+ } else {
+ this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+ }
+ this.setCurrentAndFollowingTokens();
+ }
+ // continue the processing of the EOF token with checkNextToken()
+ }
+
+ private void insertLeadingIndentToken() {
+ if (this.previousPendingTokenType == PythonLexer.WS) {
+ Token prevToken = this.pendingTokens.peekLast(); // WS token
+ if (this.getIndentationLength(prevToken.getText()) != 0) { // there is an "indentation" before the first statement
+ final String errMsg = "first statement indented";
+ this.reportLexerError(errMsg);
+ // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
+ this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken);
+ }
+ }
+ }
+
+ private void handleNEWLINEtoken() {
+ if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
+ this.hideAndAddPendingToken(this.curToken);
+ } else {
+ final Token nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
+ final boolean isLookingAhead = this.ffgToken.getType() == PythonLexer.WS;
+ if (isLookingAhead) {
+ this.setCurrentAndFollowingTokens(); // set the next two tokens
+ }
+
+ switch (this.ffgToken.getType()) {
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
+ this.hideAndAddPendingToken(nlToken);
+ if (isLookingAhead) {
+ this.addPendingToken(this.curToken); // WS token
+ }
+ break;
+ default:
+ this.addPendingToken(nlToken);
+ if (isLookingAhead) { // We're on whitespace(s) followed by a statement
+ final int indentationLength = this.ffgToken.getType() == Token.EOF ?
+ 0 :
+ this.getIndentationLength(this.curToken.getText());
+
+ if (indentationLength != this.INVALID_LENGTH) {
+ this.addPendingToken(this.curToken); // WS token
+ this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
+ } else {
+ this.reportError("inconsistent use of tabs and spaces in indentation");
+ }
+ } else { // We're at a newline followed by a statement (there is no whitespace before the statement)
+ this.insertIndentOrDedentToken(0); // may insert DEDENT token(s)
+ }
+ }
+ }
+ }
+
+ private void insertIndentOrDedentToken(final int indentLength) {
+ int prevIndentLength = this.indentLengthStack.peek();
+ if (indentLength > prevIndentLength) {
+ this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
+ this.indentLengthStack.push(indentLength);
+ } else {
+ while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
+ this.indentLengthStack.pop();
+ prevIndentLength = this.indentLengthStack.peek();
+ if (indentLength <= prevIndentLength) {
+ this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
+ } else {
+ this.reportError("inconsistent dedent");
+ }
+ }
+ }
+ }
+
+ private void insertTrailingTokens() {
+ switch (this.lastPendingTokenTypeFromDefaultChannel) {
+ case PythonLexer.NEWLINE:
+ case PythonLexer.DEDENT:
+ break; // no trailing NEWLINE token is needed
+ default:
+ // insert an extra trailing NEWLINE token that serves as the end of the last statement
+ this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // ffgToken is EOF
+ }
+ this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
+ }
+
+ private void handleEOFtoken() {
+ if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
+ // there was a statement in the input (leading NEWLINE tokens are hidden)
+ this.insertTrailingTokens();
+ }
+ this.addPendingToken(this.curToken);
+ }
+
+ private void hideAndAddPendingToken(final Token tkn) {
+ CommonToken ctkn = new CommonToken(tkn);
+ ctkn.setChannel(Token.HIDDEN_CHANNEL);
+ this.addPendingToken(ctkn);
+ }
+
+ private void createAndAddPendingToken(final int ttype, final int channel, final String text, Token sampleToken) {
+ CommonToken ctkn = new CommonToken(sampleToken);
+ ctkn.setType(ttype);
+ ctkn.setChannel(channel);
+ ctkn.setStopIndex(sampleToken.getStartIndex() - 1);
+ ctkn.setText(text == null
+ ? "<" + this.getVocabulary().getDisplayName(ttype) + ">"
+ : text);
+
+ this.addPendingToken(ctkn);
+ }
+
+ private void addPendingToken(final Token tkn) {
+ // save the last pending token type because the pendingTokens linked list can be empty by the nextToken()
+ this.previousPendingTokenType = tkn.getType();
+ if (tkn.getChannel() == Token.DEFAULT_CHANNEL) {
+ this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
+ }
+ this.pendingTokens.addLast(tkn);
+ }
+
+ private int getIndentationLength(final String indentText) { // the indentText may contain spaces, tabs or form feeds
+ final int TAB_LENGTH = 8; // the standard number of spaces to replace a tab with spaces
+ int length = 0;
+ for (char ch : indentText.toCharArray()) {
+ switch (ch) {
+ case ' ':
+ this.wasSpaceIndentation = true;
+ length += 1;
+ break;
+ case '\t':
+ this.wasTabIndentation = true;
+ length += TAB_LENGTH - (length % TAB_LENGTH);
+ break;
+ case '\f': // form feed
+ length = 0;
+ break;
+ }
+ }
+
+ if (this.wasTabIndentation && this.wasSpaceIndentation) {
+ if (!(this.wasIndentationMixedWithSpacesAndTabs)) {
+ this.wasIndentationMixedWithSpacesAndTabs = true;
+ length = this.INVALID_LENGTH; // only for the first inconsistent indent
+ }
+ }
+ return length;
+ }
+
+ private void reportLexerError(final String errMsg) {
+ this.getErrorListenerDispatch().syntaxError(this, this.curToken, this.curToken.getLine(), this.curToken.getCharPositionInLine(), " LEXER" + this.ERR_TXT + errMsg, null);
+ }
+
+ private void reportError(final String errMsg) {
+ this.reportLexerError(errMsg);
+
+ // the ERRORTOKEN will raise an error in the parser
+ this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken);
+ }
+ }
+
\ No newline at end of file
diff --git a/python/python2_7_18/JavaScript/PythonLexerBase.js b/python/python2_7_18/JavaScript/PythonLexerBase.js
index 5b1d8687ef..82b8e59536 100644
--- a/python/python2_7_18/JavaScript/PythonLexerBase.js
+++ b/python/python2_7_18/JavaScript/PythonLexerBase.js
@@ -27,7 +27,7 @@ THE SOFTWARE.
*
*/
-import { Token, CommonToken, Lexer } from "antlr4";
+import { Token, Lexer } from "antlr4";
import PythonLexer from "./PythonLexer.js";
export default class PythonLexerBase extends Lexer {
@@ -49,17 +49,27 @@ export default class PythonLexerBase extends Lexer {
this.wasSpaceIndentation;
this.wasTabIndentation;
this.wasIndentationMixedWithSpacesAndTabs;
- const INVALID_LENGTH = -1;
-
+
this.curToken; // current (under processing) token
this.ffgToken; // following (look ahead) token
- const ERR_TXT = " ERROR: ";
+ this.#init();
+ }
+
+ get #INVALID_LENGTH() { return -1; }
+ get #ERR_TXT() { return " ERROR: "; }
- this.init();
+ nextToken() { // reading the input stream until a return EOF
+ this.#checkNextToken();
+ return this.pendingTokens.shift() /* .pollFirst() */; // add the queued token to the token stream
}
- init() {
+ reset() {
+ this.#init();
+ super.reset();
+ }
+
+ #init() {
this.indentLengthStack = [];
this.pendingTokens = [];
this.previousPendingTokenType = 0;
@@ -72,16 +82,11 @@ export default class PythonLexerBase extends Lexer {
this.ffgToken = null;
}
- nextToken() { // reading the input stream until a return EOF
- this.checkNextToken();
- return this.pendingTokens.shift() /* .pollFirst() */; // add the queued token to the token stream
- }
-
- checkNextToken() {
+ #checkNextToken() {
if (this.previousPendingTokenType !== Token.EOF) {
- this.setCurrentAndFollowingTokens();
+ this.#setCurrentAndFollowingTokens();
if (this.indentLengthStack.length === 0) { // We're at the first token
- this.handleStartOfInput();
+ this.#handleStartOfInput();
}
switch (this.curToken.type) {
@@ -89,207 +94,181 @@ export default class PythonLexerBase extends Lexer {
case PythonLexer.LSQB:
case PythonLexer.LBRACE:
this.opened++;
- this.addPendingToken(this.curToken);
+ this.#addPendingToken(this.curToken);
break;
case PythonLexer.RPAR:
case PythonLexer.RSQB:
case PythonLexer.RBRACE:
this.opened--;
- this.addPendingToken(this.curToken);
+ this.#addPendingToken(this.curToken);
break;
case PythonLexer.NEWLINE:
- this.handleNEWLINEtoken();
+ this.#handleNEWLINEtoken();
break;
- case PythonLexer.STRING:
- this.handleSTRINGtoken();
- break;
- case PythonLexer.ERROR_TOKEN:
- this.reportLexerError(`token recognition error at: '${this.curToken.text}'`);
- this.addPendingToken(this.curToken);
+ case PythonLexer.ERRORTOKEN:
+ this.#reportLexerError(`token recognition error at: '${this.curToken.text}'`);
+ this.#addPendingToken(this.curToken);
break;
case Token.EOF:
- this.handleEOFtoken();
+ this.#handleEOFtoken();
break;
default:
- this.addPendingToken(this.curToken);
+ this.#addPendingToken(this.curToken);
}
}
}
- setCurrentAndFollowingTokens() {
+ #setCurrentAndFollowingTokens() {
this.curToken = this.ffgToken == undefined ?
- this.getCommonTokenByToken(super.nextToken()) :
- this.getCommonTokenByToken(this.ffgToken);
+ super.nextToken() :
+ this.ffgToken;
this.ffgToken = this.curToken.type === Token.EOF ?
this.curToken :
- this.getCommonTokenByToken(super.nextToken());
+ super.nextToken();
}
// initialize the _indentLengthStack
// hide the leading NEWLINE token(s)
// if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
// insert a leading INDENT token if necessary
- handleStartOfInput() {
+ #handleStartOfInput() {
// initialize the stack with a default 0 indentation length
this.indentLengthStack.push(0); // this will never be popped off
while (this.curToken.type !== Token.EOF) {
if (this.curToken.channel === Token.DEFAULT_CHANNEL) {
if (this.curToken.type === PythonLexer.NEWLINE) {
// all the NEWLINE tokens must be ignored before the first statement
- this.hideAndAddPendingToken(this.curToken);
+ this.#hideAndAddPendingToken(this.curToken);
} else { // We're at the first statement
- this.insertLeadingIndentToken();
- return; // continue the processing of the current token with checkNextToken()
+ this.#insertLeadingIndentToken();
+ return; // continue the processing of the current token with #checkNextToken()
}
} else {
- this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+ this.#addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
}
- this.setCurrentAndFollowingTokens();
- } // continue the processing of the EOF token with checkNextToken()
+ this.#setCurrentAndFollowingTokens();
+ } // continue the processing of the EOF token with #checkNextToken()
}
- insertLeadingIndentToken() {
+ #insertLeadingIndentToken() {
if (this.previousPendingTokenType === PythonLexer.WS) {
let prevToken = this.pendingTokens.at(- 1) /* .peekLast() */; // WS token
- if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement
+ if (this.#getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement
const errMsg = "first statement indented";
- this.reportLexerError(errMsg);
+ this.#reportLexerError(errMsg);
// insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
- this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken);
+ this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.curToken);
}
}
}
- handleNEWLINEtoken() {
+ #handleNEWLINEtoken() {
if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
- this.hideAndAddPendingToken(this.curToken);
+ this.#hideAndAddPendingToken(this.curToken);
} else {
- let nlToken = this.getCommonTokenByToken(this.curToken); // save the current NEWLINE token
+ let nlToken = this.curToken.clone(); // save the current NEWLINE token
const isLookingAhead = this.ffgToken.type === PythonLexer.WS;
if (isLookingAhead) {
- this.setCurrentAndFollowingTokens(); // set the next two tokens
+ this.#setCurrentAndFollowingTokens(); // set the next two tokens
}
switch (this.ffgToken.type) {
- case PythonLexer.NEWLINE: // We're before a blank line
- case PythonLexer.COMMENT: // We're before a comment
- this.hideAndAddPendingToken(nlToken);
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
+ this.#hideAndAddPendingToken(nlToken);
if (isLookingAhead) {
- this.addPendingToken(this.curToken); // WS token
+ this.#addPendingToken(this.curToken); // WS token
}
break;
default:
- this.addPendingToken(nlToken);
+ this.#addPendingToken(nlToken);
if (isLookingAhead) { // We're on whitespace(s) followed by a statement
const indentationLength = this.ffgToken.type === Token.EOF ?
- 0 :
- this.getIndentationLength(this.curToken.text);
+ 0 :
+ this.#getIndentationLength(this.curToken.text);
- if (indentationLength !== this.INVALID_LENGTH) {
- this.addPendingToken(this.curToken); // WS token
- this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
+ if (indentationLength !== this.#INVALID_LENGTH) {
+ this.#addPendingToken(this.curToken); // WS token
+ this.#insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
} else {
- this.reportError("inconsistent use of tabs and spaces in indentation");
+ this.#reportError("inconsistent use of tabs and spaces in indentation");
}
} else { // We're at a newline followed by a statement (there is no whitespace before the statement)
- this.insertIndentOrDedentToken(0); // may insert DEDENT token(s)
+ this.#insertIndentOrDedentToken(0); // may insert DEDENT token(s)
}
}
}
}
- insertIndentOrDedentToken(curIndentLength) {
+ #insertIndentOrDedentToken(curIndentLength) {
let prevIndentLength = this.indentLengthStack.at(-1) /* peek() */;
if (curIndentLength > prevIndentLength) {
- this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
+ this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
this.indentLengthStack.push(curIndentLength);
} else {
while (curIndentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
this.indentLengthStack.pop();
prevIndentLength = this.indentLengthStack.at(-1) /* peek() */;
if (curIndentLength <= prevIndentLength) {
- this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
+ this.#createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
} else {
- this.reportError("inconsistent dedent");
+ this.#reportError("inconsistent dedent");
}
}
}
}
- handleSTRINGtoken() { // remove the \ escape sequences from the string literal
- const line_joinFreeStringLiteral = this.curToken.text.replace(/\\(\r?\n)/g, "");
- if (this.curToken.text.length === line_joinFreeStringLiteral.length) {
- this.addPendingToken(this.curToken);
- } else {
- let originalSTRINGtoken = this.getCommonTokenByToken(this.curToken); // backup the original token
- this.curToken.text = line_joinFreeStringLiteral;
- this.addPendingToken(this.curToken); // add the modified token with inline string literal
- this.hideAndAddPendingToken(originalSTRINGtoken); // add the original token to the hidden channel
- // this inserted hidden token allows to restore the original string literal with the \ escape sequences
- }
- }
-
- insertTrailingTokens() {
+ #insertTrailingTokens() {
switch (this.lastPendingTokenTypeFromDefaultChannel) {
case PythonLexer.NEWLINE:
case PythonLexer.DEDENT:
break; // no trailing NEWLINE token is needed
default:
// insert an extra trailing NEWLINE token that serves as the end of the last statement
- this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // _ffgToken is EOF
+ this.#createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // _ffgToken is EOF
}
- this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
+ this.#insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
}
- handleEOFtoken() {
+ #handleEOFtoken() {
if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
// there was a statement in the input (leading NEWLINE tokens are hidden)
- this.insertTrailingTokens();
+ this.#insertTrailingTokens();
}
- this.addPendingToken(this.curToken);
+ this.#addPendingToken(this.curToken);
}
- hideAndAddPendingToken(cToken) {
- cToken.channel = Token.HIDDEN_CHANNEL;
- this.addPendingToken(cToken);
+ #hideAndAddPendingToken(ctkn) {
+ ctkn.channel = Token.HIDDEN_CHANNEL;
+ this.#addPendingToken(ctkn);
}
- createAndAddPendingToken(type, channel, text, baseToken) {
- const cToken = this.getCommonTokenByToken(baseToken);
- cToken.type = type;
- cToken.channel = channel;
- cToken.stop = baseToken.start - 1;
- cToken.text = text == null ?
+ #createAndAddPendingToken(type, channel, text, sampleToken) {
+ const ctkn = sampleToken.clone();
+ ctkn.type = type;
+ ctkn.channel = channel;
+ ctkn.stop = sampleToken.start - 1;
+ ctkn.text = text == null ?
`<${this.getSymbolicNames()[type]}>` :
text;
- this.addPendingToken(cToken);
+ this.#addPendingToken(ctkn);
}
- addPendingToken(token) {
+ #addPendingToken(tkn) {
// save the last pending token type because the _pendingTokens linked list can be empty by the nextToken()
- this.previousPendingTokenType = token.type;
- if (token.channel === Token.DEFAULT_CHANNEL) {
+ this.previousPendingTokenType = tkn.type;
+ if (tkn.channel === Token.DEFAULT_CHANNEL) {
this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
}
- this.pendingTokens.push(token) /* .addLast(token) */;
- }
-
- getCommonTokenByToken(oldToken) {
- let commonToken = new CommonToken(oldToken.source, oldToken.type, oldToken.channel, oldToken.start, oldToken.stop);
- commonToken.tokenIndex = oldToken.tokenIndex;
- commonToken.line = oldToken.line;
- commonToken.column = oldToken.column;
- commonToken.text = oldToken.text;
- return commonToken;
+ this.pendingTokens.push(tkn) /* .addLast(token) */;
}
- getIndentationLength(textWS) { // the textWS may contain spaces, tabs or form feeds
+ #getIndentationLength(indentText) { // the indentText may contain spaces, tabs or form feeds
const TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces
let length = 0;
-
- for (let ch of textWS) {
+ for (let ch of indentText) {
switch (ch) {
case " ":
this.wasSpaceIndentation = true;
@@ -308,25 +287,20 @@ export default class PythonLexerBase extends Lexer {
if (this.wasTabIndentation && this.wasSpaceIndentation) {
if (!this.wasIndentationMixedWithSpacesAndTabs) {
this.wasIndentationMixedWithSpacesAndTabs = true;
- return this.INVALID_LENGTH; // only for the first inconsistent indent
+ length = this.#INVALID_LENGTH; // only for the first inconsistent indent
}
}
return length;
}
- reportLexerError(errMsg) {
- this.getErrorListenerDispatch().syntaxError(this, this.curToken, this.curToken.line, this.curToken.column, " LEXER" + this.ERR_TXT + errMsg, null);
+ #reportLexerError(errMsg) {
+ this.getErrorListener().syntaxError(this, this.curToken, this.curToken.line, this.curToken.column, " LEXER" + this.#ERR_TXT + errMsg, null);
}
- reportError(errMsg) {
- this.reportLexerError(errMsg);
-
- // the ERROR_TOKEN will raise an error in the parser
- this.createAndAddPendingToken(PythonLexer.ERROR_TOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken);
- }
+ #reportError(errMsg) {
+ this.#reportLexerError(errMsg);
- reset() {
- this.init();
- super.reset();
+ // the ERRORTOKEN will raise an error in the parser
+ this.#createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.ffgToken);
}
}
diff --git a/python/python2_7_18/Python3/PythonLexerBase.py b/python/python2_7_18/Python3/PythonLexerBase.py
index dc262833cc..3abe736fdb 100644
--- a/python/python2_7_18/Python3/PythonLexerBase.py
+++ b/python/python2_7_18/Python3/PythonLexerBase.py
@@ -21,7 +21,6 @@
#
# Developed by : Robert Einhorn
-from collections import deque
from typing import TextIO
from antlr4 import InputStream, Lexer, Token
from antlr4.Token import CommonToken
@@ -33,229 +32,214 @@ def __init__(self, input: InputStream, output: TextIO = sys.stdout):
super().__init__(input, output)
# A stack that keeps track of the indentation lengths
- self.indent_length_stack: Deque[int]
+ self.__indent_length_stack: list[int]
# A list where tokens are waiting to be loaded into the token stream
- self.pending_tokens: list[CommonToken]
+ self.__pending_tokens: list[CommonToken]
# last pending token types
- self.previous_pending_token_type: int
- self.last_pending_token_type_from_default_channel: int
+ self.__previous_pending_token_type: int
+ self.__last_pending_token_type_from_default_channel: int
# The amount of opened parentheses, square brackets or curly braces
- self.opened: int
-
- self.was_space_indentation: bool
- self.was_tab_indentation: bool
- self.was_indentation_mixed_with_spaces_and_tabs: bool
- self.INVALID_LENGTH: int
-
- self.cur_token: CommonToken # current (under processing) token
- self.ffg_token: CommonToken # following (look ahead) token
-
- self.ERR_TXT: str
-
- self.init()
-
- def init(self):
- self.indent_length_stack = deque()
- self.pending_tokens = []
- self.previous_pending_token_type = 0
- self.last_pending_token_type_from_default_channel = 0
- self.opened = 0
- self.was_space_indentation = False
- self.was_tab_indentation = False
- self.was_indentation_mixed_with_spaces_and_tabs = False
- self.INVALID_LENGTH = -1
- self.cur_token = None
- self.ffg_token = None
- self.ERR_TXT = " ERROR: "
+ self.__opened: int
+
+ self.__was_space_indentation: bool
+ self.__was_tab_indentation: bool
+ self.__was_indentation_mixed_with_spaces_and_tabs: bool
+
+ self.__cur_token: CommonToken # current (under processing) token
+ self.__ffg_token: CommonToken # following (look ahead) token
+
+ self.__INVALID_LENGTH: int = -1
+ self.__ERR_TXT: str = " ERROR: "
+
+ self.__init()
def nextToken(self) -> CommonToken: # reading the input stream until a return EOF
- self.check_next_token()
- return self.pending_tokens.pop(0) # add the queued token to the token stream
-
- def check_next_token(self):
- if self.previous_pending_token_type != Token.EOF:
- self.set_current_and_following_tokens()
- if len(self.indent_length_stack) == 0: # We're at the first token
- self.handle_start_of_input()
- match self.cur_token.type:
+ self.__check_next_token()
+ return self.__pending_tokens.pop(0) # add the queued token to the token stream
+
+ def reset(self) -> None:
+ self.__init()
+ super().reset()
+
+ def __init(self) -> None:
+ self.__indent_length_stack = []
+ self.__pending_tokens = []
+ self.__previous_pending_token_type = 0
+ self.__last_pending_token_type_from_default_channel = 0
+ self.__opened = 0
+ self.__was_space_indentation = False
+ self.__was_tab_indentation = False
+ self.__was_indentation_mixed_with_spaces_and_tabs = False
+ self.__cur_token = None
+ self.__ffg_token = None
+
+ def __check_next_token(self) -> None:
+ if self.__previous_pending_token_type != Token.EOF:
+ self.__set_current_and_following_tokens()
+ if len(self.__indent_length_stack) == 0: # We're at the first token
+ self.__handle_start_of_input()
+
+ match self.__cur_token.type:
case self.LPAR | self.LSQB | self.LBRACE:
- self.opened += 1
- self.add_pending_token(self.cur_token)
+ self.__opened += 1
+ self.__add_pending_token(self.__cur_token)
case self.RPAR | self.RSQB | self.RBRACE:
- self.opened -= 1
- self.add_pending_token(self.cur_token)
+ self.__opened -= 1
+ self.__add_pending_token(self.__cur_token)
case self.NEWLINE:
- self.handle_NEWLINE_token()
- case self.STRING:
- self.handle_STRING_token()
- case self.ERROR_TOKEN:
- self.report_lexer_error("token recognition error at: '" + self.cur_token.text + "'")
- self.add_pending_token(self.cur_token)
+ self.__handle_NEWLINE_token()
+ case self.ERRORTOKEN:
+ self.__report_lexer_error("token recognition error at: '" + self.__cur_token.text + "'")
+ self.__add_pending_token(self.__cur_token)
case Token.EOF:
- self.handle_EOF_token()
+ self.__handle_EOF_token()
case other:
- self.add_pending_token(self.cur_token)
+ self.__add_pending_token(self.__cur_token)
- def set_current_and_following_tokens(self):
- self.cur_token = super().nextToken() if self.ffg_token is None else \
- self.ffg_token
+ def __set_current_and_following_tokens(self) -> None:
+ self.__cur_token = super().nextToken() if self.__ffg_token is None else \
+ self.__ffg_token
- self.ffg_token = self.cur_token if self.cur_token.type == Token.EOF else \
- super().nextToken()
+ self.__ffg_token = self.__cur_token if self.__cur_token.type == Token.EOF else \
+ super().nextToken()
# initialize the _indent_length_stack
# hide the leading NEWLINE token(s)
# if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
# insert a leading INDENT token if necessary
- def handle_start_of_input(self):
+ def __handle_start_of_input(self) -> None:
# initialize the stack with a default 0 indentation length
- self.indent_length_stack.append(0) # this will never be popped off
- while self.cur_token.type != Token.EOF:
- if self.cur_token.channel == Token.DEFAULT_CHANNEL:
- if self.cur_token.type == self.NEWLINE:
+ self.__indent_length_stack.append(0) # this will never be popped off
+ while self.__cur_token.type != Token.EOF:
+ if self.__cur_token.channel == Token.DEFAULT_CHANNEL:
+ if self.__cur_token.type == self.NEWLINE:
# all the NEWLINE tokens must be ignored before the first statement
- self.hide_and_add_pending_token(self.cur_token)
+ self.__hide_and_add_pending_token(self.__cur_token)
else: # We're at the first statement
- self.insert_leading_indent_token()
- return # continue the processing of the current token with check_next_token()
+ self.__insert_leading_indent_token()
+ return # continue the processing of the current token with __check_next_token()
else:
- self.add_pending_token(self.cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
- self.set_current_and_following_tokens()
- # continue the processing of the EOF token with check_next_token()
-
- def insert_leading_indent_token(self):
- if self.previous_pending_token_type == self.WS:
- prev_token: CommonToken = self.pending_tokens[-1] # WS token
- if self.get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement
+ self.__add_pending_token(self.__cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+ self.__set_current_and_following_tokens()
+ # continue the processing of the EOF token with __check_next_token()
+
+ def __insert_leading_indent_token(self) -> None:
+ if self.__previous_pending_token_type == self.WS:
+ prev_token: CommonToken = self.__pending_tokens[-1] # WS token
+ if self.__get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement
err_msg: str = "first statement indented"
- self.report_lexer_error(err_msg)
+ self.__report_lexer_error(err_msg)
# insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
- self.create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, self.ERR_TXT + err_msg, self.cur_token)
+ self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__cur_token)
- def handle_NEWLINE_token(self):
- if self.opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token
- self.hide_and_add_pending_token(self.cur_token)
+ def __handle_NEWLINE_token(self) -> None:
+ if self.__opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token
+ self.__hide_and_add_pending_token(self.__cur_token)
else:
- nl_token: CommonToken = self.cur_token # save the current NEWLINE token
- is_looking_ahead: bool = self.ffg_token.type == self.WS
+ nl_token: CommonToken = self.__cur_token.clone() # save the current NEWLINE token
+ is_looking_ahead: bool = self.__ffg_token.type == self.WS
if is_looking_ahead:
- self.set_current_and_following_tokens() # set the next two tokens
+ self.__set_current_and_following_tokens() # set the next two tokens
- match self.ffg_token.type:
+ match self.__ffg_token.type:
case self.NEWLINE | self.COMMENT:
- # We're before a blank line or a comment or a type comment
- self.hide_and_add_pending_token(nl_token) # ignore the NEWLINE token
+ # We're before a blank line, a comment, a type comment, or a type ignore comment
+ self.__hide_and_add_pending_token(nl_token) # ignore the NEWLINE token
if is_looking_ahead:
- self.add_pending_token(self.cur_token) # WS token
+ self.__add_pending_token(self.__cur_token) # WS token
case other:
- self.add_pending_token(nl_token)
+ self.__add_pending_token(nl_token)
if is_looking_ahead: # We're on a whitespace(s) followed by a statement
- indentation_length: int = 0 if self.ffg_token.type == Token.EOF else \
- self.get_indentation_length(self.cur_token.text)
+ indentation_length: int = 0 if self.__ffg_token.type == Token.EOF else \
+ self.__get_indentation_length(self.__cur_token.text)
- if indentation_length != self.INVALID_LENGTH:
- self.add_pending_token(self.cur_token) # WS token
- self.insert_indent_or_dedent_token(indentation_length) # may insert INDENT token or DEDENT token(s)
+ if indentation_length != self.__INVALID_LENGTH:
+ self.__add_pending_token(self.__cur_token) # WS token
+ self.__insert_indent_or_dedent_token(indentation_length) # may insert INDENT token or DEDENT token(s)
else:
- self.report_error("inconsistent use of tabs and spaces in indentation")
+ self.__report_error("inconsistent use of tabs and spaces in indentation")
else: # We're at a newline followed by a statement (there is no whitespace before the statement)
- self.insert_indent_or_dedent_token(0) # may insert DEDENT token(s)
+ self.__insert_indent_or_dedent_token(0) # may insert DEDENT token(s)
- def insert_indent_or_dedent_token(self, indent_length: int):
- prev_indent_length: int = self.indent_length_stack[-1] # peek()
+ def __insert_indent_or_dedent_token(self, indent_length: int) -> None:
+ prev_indent_length: int = self.__indent_length_stack[-1] # peek()
if indent_length > prev_indent_length:
- self.create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, None, self.ffg_token)
- self.indent_length_stack.append(indent_length)
+ self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token)
+ self.__indent_length_stack.append(indent_length)
else:
while indent_length < prev_indent_length: # more than 1 DEDENT token may be inserted to the token stream
- self.indent_length_stack.pop()
- prev_indent_length = self.indent_length_stack[-1] # peek()
+ self.__indent_length_stack.pop()
+ prev_indent_length = self.__indent_length_stack[-1] # peek()
if indent_length <= prev_indent_length:
- self.create_and_add_pending_token(self.DEDENT, Token.DEFAULT_CHANNEL, None, self.ffg_token)
+ self.__create_and_add_pending_token(self.DEDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token)
else:
- self.report_error("inconsistent dedent")
+ self.__report_error("inconsistent dedent")
- def handle_STRING_token(self): # remove the \ escape sequences from the string literal
- # https://docs.python.org/3.11/reference/lexical_analysis.html#string-and-bytes-literals
- line_joinFreeStringLiteral: str = re.sub(r"\\\r?\n", "", self.cur_token.text)
- if len(self.cur_token.text) == len(line_joinFreeStringLiteral):
- self.add_pending_token(self.cur_token)
- else:
- originalSTRINGtoken: CommonToken = self.cur_token.clone() # backup the original token
- self.cur_token.text = line_joinFreeStringLiteral
- self.add_pending_token(self.cur_token) # add the modified token with inline string literal
- self.hide_and_add_pending_token(originalSTRINGtoken) # add the original token to the hidden channel
- # this inserted hidden token allows to restore the original string literal with the \ escape sequences
-
- def insert_trailing_tokens(self):
- match self.last_pending_token_type_from_default_channel:
+ def __insert_trailing_tokens(self) -> None:
+ match self.__last_pending_token_type_from_default_channel:
case self.NEWLINE | self.DEDENT:
pass # no trailing NEWLINE token is needed
case other:
# insert an extra trailing NEWLINE token that serves as the end of the last statement
- self.create_and_add_pending_token(self.NEWLINE, Token.DEFAULT_CHANNEL, None, self.ffg_token) # _ffg_token is EOF
- self.insert_indent_or_dedent_token(0) # Now insert as much trailing DEDENT tokens as needed
+ self.__create_and_add_pending_token(self.NEWLINE, Token.DEFAULT_CHANNEL, None, self.__ffg_token) # _ffg_token is EOF
+ self.__insert_indent_or_dedent_token(0) # Now insert as much trailing DEDENT tokens as needed
- def handle_EOF_token(self):
- if self.last_pending_token_type_from_default_channel > 0:
+ def __handle_EOF_token(self) -> None:
+ if self.__last_pending_token_type_from_default_channel > 0:
# there was statement in the input (leading NEWLINE tokens are hidden)
- self.insert_trailing_tokens()
- self.add_pending_token(self.cur_token)
+ self.__insert_trailing_tokens()
+ self.__add_pending_token(self.__cur_token)
- def hide_and_add_pending_token(self, cToken: CommonToken):
- cToken.channel = Token.HIDDEN_CHANNEL
- self.add_pending_token(cToken)
+ def __hide_and_add_pending_token(self, ctkn: CommonToken) -> None:
+ ctkn.channel = Token.HIDDEN_CHANNEL
+ self.__add_pending_token(ctkn)
- def create_and_add_pending_token(self, type: int, channel: int, text: str, base_token: CommonToken):
- cToken: CommonToken = base_token.clone()
- cToken.type = type
- cToken.channel = channel
- cToken.stop = base_token.start - 1
- cToken.text = "<" + self.symbolicNames[type] + ">" if text is None else \
- text
+ def __create_and_add_pending_token(self, ttype: int, channel: int, text: str, sample_token: CommonToken) -> None:
+ ctkn: CommonToken = sample_token.clone()
+ ctkn.type = ttype
+ ctkn.channel = channel
+ ctkn.stop = sample_token.start - 1
+ ctkn.text = "<" + self.symbolicNames[ttype] + ">" if text is None else \
+ text
- self.add_pending_token(cToken)
+ self.__add_pending_token(ctkn)
- def add_pending_token(self, token: CommonToken):
+ def __add_pending_token(self, ctkn: CommonToken) -> None:
# save the last pending token type because the _pending_tokens list can be empty by the nextToken()
- self.previous_pending_token_type = token.type
- if token.channel == Token.DEFAULT_CHANNEL:
- self.last_pending_token_type_from_default_channel = self.previous_pending_token_type
- self.pending_tokens.append(token)
+ self.__previous_pending_token_type = ctkn.type
+ if ctkn.channel == Token.DEFAULT_CHANNEL:
+ self.__last_pending_token_type_from_default_channel = self.__previous_pending_token_type
+ self.__pending_tokens.append(ctkn)
- def get_indentation_length(self, textWS: str) -> int: # the textWS may contain spaces, tabs or form feeds
+ def __get_indentation_length(self, indentText: str) -> int: # the indentText may contain spaces, tabs or form feeds
TAB_LENGTH: int = 8 # the standard number of spaces to replace a tab to spaces
length: int = 0
ch: str
- for ch in textWS:
+ for ch in indentText:
match ch:
case ' ':
- self.was_space_indentation = True
+ self.__was_space_indentation = True
length += 1
case '\t':
- self.was_tab_indentation = True
+ self.__was_tab_indentation = True
length += TAB_LENGTH - (length % TAB_LENGTH)
case '\f': # form feed
length = 0
- if self.was_tab_indentation and self.was_space_indentation:
- if not self.was_indentation_mixed_with_spaces_and_tabs:
- self.was_indentation_mixed_with_spaces_and_tabs = True
- return self.INVALID_LENGTH # only for the first inconsistent indent
+ if self.__was_tab_indentation and self.__was_space_indentation:
+ if not self.__was_indentation_mixed_with_spaces_and_tabs:
+ self.__was_indentation_mixed_with_spaces_and_tabs = True
+ length = self.__INVALID_LENGTH # only for the first inconsistent indent
return length
- def report_lexer_error(self, err_msg):
- self.getErrorListenerDispatch().syntaxError(self, self.cur_token, self.cur_token.line, self.cur_token.column, " LEXER" + self.ERR_TXT + err_msg, None)
+ def __report_lexer_error(self, err_msg: str) -> None:
+ self.getErrorListenerDispatch().syntaxError(self, self.__cur_token, self.__cur_token.line, self.__cur_token.column, " LEXER" + self.__ERR_TXT + err_msg, None)
- def report_error(self, err_msg):
- self.report_lexer_error(err_msg)
+ def __report_error(self, err_msg: str) -> None:
+ self.__report_lexer_error(err_msg)
- # the ERROR_TOKEN will raise an error in the parser
- self.create_and_add_pending_token(self.ERROR_TOKEN, Token.DEFAULT_CHANNEL, self.ERR_TXT + err_msg, self.ffg_token)
-
- def reset(self):
- self.init()
- super().reset()
+ # the ERRORTOKEN will raise an error in the parser
+ self.__create_and_add_pending_token(self.ERRORTOKEN, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__ffg_token)
diff --git a/python/python2_7_18/PythonLexer.g4 b/python/python2_7_18/PythonLexer.g4
index 6a045a38a6..5146572012 100644
--- a/python/python2_7_18/PythonLexer.g4
+++ b/python/python2_7_18/PythonLexer.g4
@@ -131,7 +131,7 @@ NUMBER
STRING : STRING_LITERAL;
// https://docs.python.org/2.7/reference/lexical_analysis.html#physical-lines
-NEWLINE : OS_INDEPENDENT_NL;
+NEWLINE : '\r'? '\n'; // Unix, Windows
// https://docs.python.org/2.7/reference/lexical_analysis.html#comments
COMMENT : '#' ~[\r\n]* -> channel(HIDDEN);
@@ -142,7 +142,7 @@ WS : [ \t\f]+ -> channel(HIDDEN);
// https://docs.python.org/2.7/reference/lexical_analysis.html#explicit-line-joining
EXPLICIT_LINE_JOINING : '\\' NEWLINE -> channel(HIDDEN);
-ERROR_TOKEN : . ; // catch unrecognized characters and redirect these errors to the parser
+ERRORTOKEN : . ; // catch unrecognized characters and redirect these errors to the parser
/*
@@ -173,10 +173,10 @@ fragment LONG_STRING_ITEM : LONG_STRING_CHAR | ESCAPE_SEQ;
fragment SHORT_STRING_CHAR_NO_SINGLE_QUOTE : ~[\\\r\n']; //
fragment SHORT_STRING_CHAR_NO_DOUBLE_QUOTE : ~[\\\r\n"]; //
fragment LONG_STRING_CHAR : ~'\\'; //
-fragment ESCAPE_SEQ
- : '\\' OS_INDEPENDENT_NL // \ escape sequence
- | '\\' [\u0000-\u007F] // "\"
- ; // the \ (not \n) escape sequences will be removed from the string literals by the PythonLexerBase class
+fragment ESCAPE_SEQ // https://docs.python.org/2.7/reference/lexical_analysis.html#string-literals
+ : '\\' '\r' '\n' // for the two-character Windows line break: \ escape sequence (string literal line continuation)
+ | '\\' [\u0000-\u007F] // "\"
+ ;
// https://docs.python.org/2.7/reference/lexical_analysis.html#integer-and-long-integer-literals
fragment LONG_INTEGER : INTEGER ('l' | 'L');
@@ -201,9 +201,6 @@ fragment EXPONENT : ('e' | 'E') ('+' | '-')? DIGIT+;
// https://docs.python.org/2.7/reference/lexical_analysis.html#imaginary-literals
fragment IMAG_NUMBER : (FLOAT_NUMBER | INT_PART) ('j' | 'J');
-// https://docs.python.org/2.7/reference/lexical_analysis.html#physical-lines
-fragment OS_INDEPENDENT_NL : '\r'? '\n'; // Unix, Windows
-
// https://docs.python.org/2.7/reference/lexical_analysis.html#identifiers
fragment IDENTIFIER : (LETTER | '_') (LETTER | DIGIT | '_')*;
fragment LETTER : LOWERCASE | UPPERCASE;
diff --git a/python/python2_7_18/TypeScript/PythonLexerBase.ts b/python/python2_7_18/TypeScript/PythonLexerBase.ts
new file mode 100644
index 0000000000..0c8ad608b1
--- /dev/null
+++ b/python/python2_7_18/TypeScript/PythonLexerBase.ts
@@ -0,0 +1,306 @@
+/*
+The MIT License (MIT)
+Copyright (c) 2021 Robert Einhorn
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+ */
+
+/*
+ *
+ * Project : Python Indent/Dedent handler for ANTLR4 grammars
+ *
+ * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com
+ *
+ */
+
+import { CharStream, Token, Lexer } from "antlr4";
+import PythonLexer from "./PythonLexer";
+import * as Collections from "typescript-collections";
+
+export default abstract class PythonLexerBase extends Lexer {
+ // A stack that keeps track of the indentation lengths
+ private indentLengthStack!: Collections.Stack;
+ // A list where tokens are waiting to be loaded into the token stream
+ private pendingTokens!: Array;
+
+ // last pending token types
+ private previousPendingTokenType!: number;
+ private lastPendingTokenTypeFromDefaultChannel!: number;
+
+ // The amount of opened parentheses, square brackets or curly braces
+ private opened!: number;
+
+ private wasSpaceIndentation!: boolean;
+ private wasTabIndentation!: boolean;
+ private wasIndentationMixedWithSpacesAndTabs!: boolean;
+
+ private curToken: Token | undefined; // current (under processing) token
+ private ffgToken: Token | undefined; // following (look ahead) token
+
+ private readonly INVALID_LENGTH: number = -1;
+ private readonly ERR_TXT: string = " ERROR: ";
+
+ protected constructor(input: CharStream) {
+ super(input);
+ this.init();
+ }
+
+ public nextToken(): Token { // reading the input stream until a return EOF
+ this.checkNextToken();
+ return this.pendingTokens.shift()! /* .pollFirst() */; // add the queued token to the token stream
+ }
+
+ public reset(): void {
+ this.init();
+ super.reset();
+ }
+
+ private init(): void {
+ this.indentLengthStack = new Collections.Stack();
+ this.pendingTokens = [];
+ this.previousPendingTokenType = 0;
+ this.lastPendingTokenTypeFromDefaultChannel = 0;
+ this.opened = 0;
+ this.wasSpaceIndentation = false;
+ this.wasTabIndentation = false;
+ this.wasIndentationMixedWithSpacesAndTabs = false;
+ this.curToken = undefined;
+ this.ffgToken = undefined;
+ }
+
+ private checkNextToken(): void {
+ if (this.previousPendingTokenType !== PythonLexer.EOF) {
+ this.setCurrentAndFollowingTokens();
+ if (this.indentLengthStack.isEmpty()) { // We're at the first token
+ this.handleStartOfInput();
+ }
+
+ switch (this.curToken!.type) {
+ case PythonLexer.LPAR:
+ case PythonLexer.LSQB:
+ case PythonLexer.LBRACE:
+ this.opened++;
+ this.addPendingToken(this.curToken!);
+ break;
+ case PythonLexer.RPAR:
+ case PythonLexer.RSQB:
+ case PythonLexer.RBRACE:
+ this.opened--;
+ this.addPendingToken(this.curToken!);
+ break;
+ case PythonLexer.NEWLINE:
+ this.handleNEWLINEtoken();
+ break;
+ case PythonLexer.ERRORTOKEN:
+ this.reportLexerError(`token recognition error at: '${this.curToken!.text}'`);
+ this.addPendingToken(this.curToken!);
+ break;
+ case PythonLexer.EOF:
+ this.handleEOFtoken();
+ break;
+ default:
+ this.addPendingToken(this.curToken!);
+ }
+ }
+ }
+
+ private setCurrentAndFollowingTokens(): void {
+ this.curToken = this.ffgToken == undefined
+ ? super.nextToken()
+ : this.ffgToken;
+
+ this.ffgToken = this.curToken.type === PythonLexer.EOF
+ ? this.curToken
+ : super.nextToken();
+ }
+
+ // initialize the indentLengthStack
+ // hide the leading NEWLINE token(s)
+ // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
+ // insert a leading INDENT token if necessary
+ private handleStartOfInput(): void {
+ // initialize the stack with a default 0 indentation length
+ this.indentLengthStack.push(0); // this will never be popped off
+ while (this.curToken!.type !== PythonLexer.EOF) {
+ if (this.curToken!.channel === Token.DEFAULT_CHANNEL) {
+ if (this.curToken!.type === PythonLexer.NEWLINE) {
+ // all the NEWLINE tokens must be ignored before the first statement
+ this.hideAndAddPendingToken(this.curToken!);
+ } else { // We're at the first statement
+ this.insertLeadingIndentToken();
+ return; // continue the processing of the current token with checkNextToken()
+ }
+ } else {
+ this.addPendingToken(this.curToken!); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+ }
+ this.setCurrentAndFollowingTokens();
+ } // continue the processing of the EOF token with checkNextToken()
+ }
+
+ private insertLeadingIndentToken(): void {
+ if (this.previousPendingTokenType === PythonLexer.WS) {
+ const prevToken: Token = this.pendingTokens[this.pendingTokens.length - 1] /* .peekLast() */; // WS token
+ if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement
+ const errMsg: string = "first statement indented";
+ this.reportLexerError(errMsg);
+ // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
+ this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken!);
+ }
+ }
+ }
+
+ private handleNEWLINEtoken(): void {
+ if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
+ this.hideAndAddPendingToken(this.curToken!);
+ } else {
+ const nlToken: Token = this.curToken?.clone()!; // save the current NEWLINE token
+ const isLookingAhead: boolean = this.ffgToken!.type === PythonLexer.WS;
+ if (isLookingAhead) {
+ this.setCurrentAndFollowingTokens(); // set the next two tokens
+ }
+
+ switch (this.ffgToken!.type) {
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
+ this.hideAndAddPendingToken(nlToken);
+ if (isLookingAhead) {
+ this.addPendingToken(this.curToken!); // WS token
+ }
+ break;
+ default:
+ this.addPendingToken(nlToken);
+ if (isLookingAhead) { // We're on whitespace(s) followed by a statement
+ const indentationLength: number = this.ffgToken!.type === PythonLexer.EOF ?
+ 0 :
+ this.getIndentationLength(this.curToken!.text);
+
+ if (indentationLength !== this.INVALID_LENGTH) {
+ this.addPendingToken(this.curToken!); // WS token
+ this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
+ } else {
+ this.reportError("inconsistent use of tabs and spaces in indentation");
+ }
+ } else { // We're at a newline followed by a statement (there is no whitespace before the statement)
+ this.insertIndentOrDedentToken(0); // may insert DEDENT token(s)
+ }
+ }
+ }
+ }
+
+ private insertIndentOrDedentToken(indentLength: number): void {
+ let prevIndentLength: number = this.indentLengthStack.peek()!;
+ if (indentLength > prevIndentLength) {
+ this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!);
+ this.indentLengthStack.push(indentLength);
+ } else {
+ while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
+ this.indentLengthStack.pop();
+ prevIndentLength = this.indentLengthStack.peek()!;
+ if (indentLength <= prevIndentLength) {
+ this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!);
+ } else {
+ this.reportError("inconsistent dedent");
+ }
+ }
+ }
+ }
+
+ private insertTrailingTokens(): void {
+ switch (this.lastPendingTokenTypeFromDefaultChannel) {
+ case PythonLexer.NEWLINE:
+ case PythonLexer.DEDENT:
+ break; // no trailing NEWLINE token is needed
+ default:
+ // insert an extra trailing NEWLINE token that serves as the end of the last statement
+ this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken!); // ffgToken is EOF
+ }
+ this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
+ }
+
+ private handleEOFtoken(): void {
+ if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
+ // there was a statement in the input (leading NEWLINE tokens are hidden)
+ this.insertTrailingTokens();
+ }
+ this.addPendingToken(this.curToken!);
+ }
+
+ private hideAndAddPendingToken(tkn: Token): void {
+ tkn.channel = Token.HIDDEN_CHANNEL;
+ this.addPendingToken(tkn);
+ }
+
+ private createAndAddPendingToken(type: number, channel: number, text: string | null, sampleToken: Token): void {
+ const tkn: Token = sampleToken.clone();
+ tkn.type = type;
+ tkn.channel = channel;
+ tkn.stop = sampleToken.start - 1;
+ tkn.text = text == null ?
+ `<${this.getSymbolicNames()[type]}>` :
+ text;
+
+ this.addPendingToken(tkn);
+ }
+
+ private addPendingToken(tkn: Token): void {
+ // save the last pending token type because the pendingTokens linked list can be empty by the nextToken()
+ this.previousPendingTokenType = tkn.type;
+ if (tkn.channel === Token.DEFAULT_CHANNEL) {
+ this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
+ }
+ this.pendingTokens.push(tkn) /* .addLast(token) */;
+ }
+
+ private getIndentationLength(indentText: string): number { // the indentText may contain spaces, tabs or form feeds
+ const TAB_LENGTH: number = 8; // the standard number of spaces to replace a tab to spaces
+ let length: number = 0;
+ for (let ch of indentText) {
+ switch (ch) {
+ case " ":
+ this.wasSpaceIndentation = true;
+ length += 1;
+ break;
+ case "\t":
+ this.wasTabIndentation = true;
+ length += TAB_LENGTH - (length % TAB_LENGTH);
+ break;
+ case "\f": // form feed
+ length = 0;
+ break;
+ }
+ }
+
+ if (this.wasTabIndentation && this.wasSpaceIndentation) {
+ if (!this.wasIndentationMixedWithSpacesAndTabs) {
+ this.wasIndentationMixedWithSpacesAndTabs = true;
+ length = this.INVALID_LENGTH; // only for the first inconsistent indent
+ }
+ }
+ return length;
+ }
+
+ private reportLexerError(errMsg: string): void {
+ this.getErrorListener().syntaxError(this, 0 /* this.curToken */, this.curToken!.line, this.curToken!.column, " LEXER" + this.ERR_TXT + errMsg, undefined);
+ }
+
+ private reportError(errMsg: string): void {
+ this.reportLexerError(errMsg);
+
+ // the ERRORTOKEN will raise an error in the parser
+ this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken!);
+ }
+}
diff --git a/python/python2_7_18/changes.txt b/python/python2_7_18/changes.txt
new file mode 100644
index 0000000000..e945e969ee
--- /dev/null
+++ b/python/python2_7_18/changes.txt
@@ -0,0 +1,4 @@
+Sept 05, 2024
+--------------
+Line continuation for string literals (backslash followed by a newline) is no longer resolved.
+(backslash+newline is no longer removed from string literals)
diff --git a/python/python2_7_18/desc.xml b/python/python2_7_18/desc.xml
index f6ddd173f0..8aa6fdea92 100644
--- a/python/python2_7_18/desc.xml
+++ b/python/python2_7_18/desc.xml
@@ -1,9 +1,9 @@
- ^4.13.1
- CSharp;Java;Python3;JavaScript
+ ^4.13.2
+ CSharp;Java;Python3;JavaScript;TypeScript
- CSharp;Java;Python3;JavaScript
+ CSharp;Java;Python3;JavaScript;TypeScript
file_input
examples
diff --git a/python/python2_7_18/tests/test_error_first_statement_indented.py b/python/python2_7_18/tests/test_error_first_statement_indented.py
index dc70cc8572..39431ac786 100644
--- a/python/python2_7_18/tests/test_error_first_statement_indented.py
+++ b/python/python2_7_18/tests/test_error_first_statement_indented.py
@@ -4,7 +4,7 @@
# EXPECTATIONS:
# - inserted leading INDENT token
# - hidden NEWLINE tokens (channel=1) before the first statement
-# - lexer error message: "line 10:3 first statement indented"
+# - lexer error message: "line 10:3 LEXER ERROR: first statement indented"
i = 1 # first statement begins with space
diff --git a/python/python2_7_18/tests/test_error_inconsistent_dedent.py b/python/python2_7_18/tests/test_error_inconsistent_dedent.py
index 0a74fde76a..660f59ff65 100644
--- a/python/python2_7_18/tests/test_error_inconsistent_dedent.py
+++ b/python/python2_7_18/tests/test_error_inconsistent_dedent.py
@@ -3,7 +3,7 @@
#
# EXPECTATIONS:
# - inserted ERROR_TOKEN instead of the DEDENT token
-# - lexer error message: "line 10:0 inconsistent dedent"
+# - lexer error message: "line 10:0 LEXER ERROR: inconsistent dedent"
if True:
i = 0
diff --git a/python/python2_7_18/tests/test_error_tab_and_space_in_indentation.py b/python/python2_7_18/tests/test_error_tab_and_space_in_indentation.py
index 493933be68..7d77a9bc0e 100644
--- a/python/python2_7_18/tests/test_error_tab_and_space_in_indentation.py
+++ b/python/python2_7_18/tests/test_error_tab_and_space_in_indentation.py
@@ -3,7 +3,7 @@
#
# EXPECTATIONS:
# - inserted ERROR_TOKEN instead of the WS token
-# - lexer error message: "line 11:0 inconsistent use of tabs and spaces in indentation"
+# - lexer error message: "line 11:0 LEXER ERROR: inconsistent use of tabs and spaces in indentation"
if True:
i = 0 # indented by spaces
diff --git a/python/python2_7_18/tests/test_error_unexpected_indent.py b/python/python2_7_18/tests/test_error_unexpected_indent.py
index 9d6bbd3f1f..9fca02bf5d 100644
--- a/python/python2_7_18/tests/test_error_unexpected_indent.py
+++ b/python/python2_7_18/tests/test_error_unexpected_indent.py
@@ -2,7 +2,7 @@
# grun Python file_input -tokens test_error_unexpected_indent.py
#
# EXPECTATION:
-# - parser error message: "line 9:7 extraneous input '' ..."
+# - parser error message: "line 9:7 mismatched input '' ..."
if True:
i = 0
diff --git a/python/python2_7_18/tests/test_explicit_line_joining.py b/python/python2_7_18/tests/test_explicit_line_joining.py
index 011ee61e4b..55be1bd964 100644
--- a/python/python2_7_18/tests/test_explicit_line_joining.py
+++ b/python/python2_7_18/tests/test_explicit_line_joining.py
@@ -2,7 +2,7 @@
# grun Python file_input -tokens test_explicit_line_joining.py
#
# EXPECTATIONS:
-# - hiden (channel=1) LINE_JOINING token
+# - hidden (channel=1) EXPLICIT_LINE_JOINING token
# - no error message
i = 1 \
diff --git a/python/python2_7_18/tests/test_hidden_NEWLINE_before_comment.py b/python/python2_7_18/tests/test_hidden_NEWLINE_before_comment.py
index d080bc16fb..9db3798954 100644
--- a/python/python2_7_18/tests/test_hidden_NEWLINE_before_comment.py
+++ b/python/python2_7_18/tests/test_hidden_NEWLINE_before_comment.py
@@ -6,6 +6,6 @@ def inc(value):
# grun Python file_input -tokens test_hidden_NEWLINE_before_comment.py
#
# EXPECTATIONS:
-# - hidden NEWLINE tokens (channel=1) before a COMMENT (or a TYPE_COMMENT) token
+# - hidden NEWLINE tokens (channel=1) before a COMMENT token
# - hidden NEWLINE token (channel=1) before the blank line
# - no error message
diff --git a/python/python2_7_18/tests/test_string_literal_with_newline_escape_sequence.py b/python/python2_7_18/tests/test_string_literal_with_newline_escape_sequence.py
deleted file mode 100644
index f14d73cb74..0000000000
--- a/python/python2_7_18/tests/test_string_literal_with_newline_escape_sequence.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# COMMAND LINE:
-# grun Python file_input -tokens test_string_literal_with_newline_escape_sequence.py
-#
-# EXPECTATIONS:
-# - removed \ escape sequence from the STRING token
-# - inserted hidden token (channel=1) with the original string literal
-# - no error message
-
-s = 'This string will not include \
-backslashes or newline characters.'
diff --git a/python/python3_12_1/CSharp/PythonLexerBase.cs b/python/python3_12/CSharp/PythonLexerBase.cs
similarity index 83%
rename from python/python3_12_1/CSharp/PythonLexerBase.cs
rename to python/python3_12/CSharp/PythonLexerBase.cs
index aedd7346ed..f67f3a1c62 100644
--- a/python/python3_12_1/CSharp/PythonLexerBase.cs
+++ b/python/python3_12/CSharp/PythonLexerBase.cs
@@ -37,6 +37,7 @@ public abstract class PythonLexerBase : Lexer
private Stack indentLengthStack;
// A list where tokens are waiting to be loaded into the token stream
private LinkedList pendingTokens;
+
// last pending token types
private int previousPendingTokenType;
private int lastPendingTokenTypeFromDefaultChannel;
@@ -49,11 +50,11 @@ public abstract class PythonLexerBase : Lexer
private bool wasSpaceIndentation;
private bool wasTabIndentation;
private bool wasIndentationMixedWithSpacesAndTabs;
- private const int INVALID_LENGTH = -1;
- private CommonToken curToken; // current (under processing) token
- private IToken ffgToken; // following (look ahead) token
+ private IToken curToken; // current (under processing) token
+ private IToken ffgToken; // following (look ahead) token
+ private const int INVALID_LENGTH = -1;
private const string ERR_TXT = " ERROR: ";
protected PythonLexerBase(ICharStream input) : base(input)
@@ -66,6 +67,20 @@ protected PythonLexerBase(ICharStream input, TextWriter output, TextWriter error
this.Init();
}
+ public override IToken NextToken() // reading the input stream until a return EOF
+ {
+ this.CheckNextToken();
+ IToken firstPendingToken = this.pendingTokens.First.Value;
+ this.pendingTokens.RemoveFirst();
+ return firstPendingToken; // add the queued token to the token stream
+ }
+
+ public override void Reset()
+ {
+ this.Init();
+ base.Reset();
+ }
+
private void Init()
{
this.indentLengthStack = new Stack();
@@ -81,14 +96,6 @@ private void Init()
this.ffgToken = null!;
}
- public override IToken NextToken() // reading the input stream until a return EOF
- {
- this.CheckNextToken();
- IToken firstPendingToken = this.pendingTokens.First.Value;
- this.pendingTokens.RemoveFirst();
- return firstPendingToken; // add the queued token to the token stream
- }
-
private void CheckNextToken()
{
if (this.previousPendingTokenType != TokenConstants.EOF)
@@ -116,13 +123,10 @@ private void CheckNextToken()
case PythonLexer.NEWLINE:
this.HandleNEWLINEtoken();
break;
- case PythonLexer.STRING:
- this.HandleSTRINGtoken();
- break;
case PythonLexer.FSTRING_MIDDLE:
this.HandleFSTRING_MIDDLE_token();
break;
- case PythonLexer.ERROR_TOKEN:
+ case PythonLexer.ERRORTOKEN:
this.ReportLexerError("token recognition error at: '" + this.curToken.Text + "'");
this.AddPendingToken(this.curToken);
break;
@@ -140,14 +144,14 @@ private void CheckNextToken()
private void SetCurrentAndFollowingTokens()
{
this.curToken = this.ffgToken == null ?
- new CommonToken(base.NextToken()) :
- new CommonToken(this.ffgToken);
+ base.NextToken() :
+ this.ffgToken;
this.HandleFStringLexerModes();
this.ffgToken = this.curToken.Type == TokenConstants.EOF ?
- this.curToken :
- base.NextToken();
+ this.curToken :
+ base.NextToken();
}
// initialize the _indentLengths
@@ -205,7 +209,7 @@ private void HandleNEWLINEtoken()
}
else
{
- CommonToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
+ IToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
bool isLookingAhead = this.ffgToken.Type == PythonLexer.WS;
if (isLookingAhead)
{
@@ -214,13 +218,12 @@ private void HandleNEWLINEtoken()
switch (this.ffgToken.Type)
{
- case PythonLexer.NEWLINE: // We're before a blank line
- case PythonLexer.COMMENT: // We're before a comment
- case PythonLexer.TYPE_COMMENT: // We're before a type comment
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
this.HideAndAddPendingToken(nlToken);
if (isLookingAhead)
{
- this.AddPendingToken(this.curToken); // WS token
+ this.AddPendingToken(this.curToken); // WS token
}
break;
default:
@@ -253,7 +256,6 @@ private void HandleNEWLINEtoken()
private void InsertIndentOrDedentToken(int indentLength)
{
- //*** https://docs.python.org/3/reference/lexical_analysis.html#indentation
int prevIndentLength = this.indentLengthStack.Peek();
if (indentLength > prevIndentLength)
{
@@ -278,25 +280,6 @@ private void InsertIndentOrDedentToken(int indentLength)
}
}
- private void HandleSTRINGtoken()
- {
- // remove the \ escape sequences from the string literal
- // https://docs.python.org/3.11/reference/lexical_analysis.html#string-and-bytes-literals
- string line_joinFreeStringLiteral = Regex.Replace(this.curToken.Text, @"\\\r?\n", "");
- if (this.curToken.Text.Length == line_joinFreeStringLiteral.Length)
- {
- this.AddPendingToken(this.curToken);
- }
- else
- {
- CommonToken originalSTRINGtoken = new CommonToken(this.curToken); // backup the original token
- this.curToken.Text = line_joinFreeStringLiteral;
- this.AddPendingToken(this.curToken); // add the modified token with inline string literal
- this.HideAndAddPendingToken(originalSTRINGtoken); // add the original token with a hidden channel
- // this inserted hidden token allows to restore the original string literal with the \ escape sequences
- }
- }
-
private void HandleFSTRING_MIDDLE_token() // replace the double braces '{{' or '}}' to single braces and hide the second braces
{
string fsMid = this.curToken.Text;
@@ -325,7 +308,7 @@ private void HandleFStringLexerModes() // https://peps.python.org/pep-0498/#spe
switch (this.curToken.Type)
{
case PythonLexer.LBRACE:
- this.PushMode(PythonLexer.DEFAULT_MODE);
+ this.PushMode(Lexer.DEFAULT_MODE);
this.paren_or_bracket_openedStack.Push(0);
break;
case PythonLexer.LPAR:
@@ -358,7 +341,7 @@ private void HandleFStringLexerModes() // https://peps.python.org/pep-0498/#spe
case PythonLexer.RBRACE:
switch (CurrentMode)
{
- case PythonLexer.DEFAULT_MODE:
+ case Lexer.DEFAULT_MODE:
case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE:
case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE:
this.PopMode();
@@ -412,42 +395,43 @@ private void HandleEOFtoken()
this.AddPendingToken(this.curToken);
}
- private void HideAndAddPendingToken(CommonToken cToken)
+ private void HideAndAddPendingToken(IToken tkn)
{
- cToken.Channel = TokenConstants.HiddenChannel;
- this.AddPendingToken(cToken);
+ CommonToken ctkn = new CommonToken(tkn);
+ ctkn.Channel = TokenConstants.HiddenChannel;
+ this.AddPendingToken(ctkn);
}
- private void CreateAndAddPendingToken(int type, int channel, string text, IToken baseToken)
+ private void CreateAndAddPendingToken(int ttype, int channel, string text, IToken sampleToken)
{
- CommonToken cToken = new CommonToken(baseToken);
- cToken.Type = type;
- cToken.Channel = channel;
- cToken.StopIndex = baseToken.StartIndex - 1;
+ CommonToken ctkn = new CommonToken(sampleToken);
+ ctkn.Type = ttype;
+ ctkn.Channel = channel;
+ ctkn.StopIndex = sampleToken.StartIndex - 1;
- cToken.Text = text == null
- ? "<" + Vocabulary.GetSymbolicName(type) + ">"
+ ctkn.Text = text == null
+ ? "<" + Vocabulary.GetSymbolicName(ttype) + ">"
: text;
- this.AddPendingToken(cToken);
+ this.AddPendingToken(ctkn);
}
- private void AddPendingToken(IToken token)
+ private void AddPendingToken(IToken tkn)
{
// save the last pending token type because the pendingTokens linked list can be empty by the nextToken()
- this.previousPendingTokenType = token.Type;
- if (token.Channel == TokenConstants.DefaultChannel)
+ this.previousPendingTokenType = tkn.Type;
+ if (tkn.Channel == TokenConstants.DefaultChannel)
{
this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
}
- this.pendingTokens.AddLast(token);
+ this.pendingTokens.AddLast(tkn);
}
- private int GetIndentationLength(string textWS) // the textWS may contain spaces, tabs or form feeds
+ private int GetIndentationLength(string indentText) // the indentText may contain spaces, tabs or form feeds
{
const int TAB_LENGTH = 8; // the standard number of spaces to replace a tab with spaces
int length = 0;
- foreach (char ch in textWS)
+ foreach (char ch in indentText)
{
switch (ch)
{
@@ -470,7 +454,7 @@ private int GetIndentationLength(string textWS) // the textWS may contain spaces
if (!this.wasIndentationMixedWithSpacesAndTabs)
{
this.wasIndentationMixedWithSpacesAndTabs = true;
- return PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent
+ length = PythonLexerBase.INVALID_LENGTH; // only for the first inconsistent indent
}
}
return length;
@@ -485,13 +469,7 @@ private void ReportError(string errMsg)
{
this.ReportLexerError(errMsg);
- // the ERROR_TOKEN will raise an error in the parser
- this.CreateAndAddPendingToken(PythonLexer.ERROR_TOKEN, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken);
- }
-
- public override void Reset()
- {
- this.Init();
- base.Reset();
+ // the ERRORTOKEN will raise an error in the parser
+ this.CreateAndAddPendingToken(PythonLexer.ERRORTOKEN, TokenConstants.DefaultChannel, PythonLexerBase.ERR_TXT + errMsg, this.ffgToken);
}
}
diff --git a/python/python3_12_1/CSharp/PythonParserBase.cs b/python/python3_12/CSharp/PythonParserBase.cs
similarity index 100%
rename from python/python3_12_1/CSharp/PythonParserBase.cs
rename to python/python3_12/CSharp/PythonParserBase.cs
diff --git a/python/python3_12_1/Java/PythonLexerBase.java b/python/python3_12/Java/PythonLexerBase.java
similarity index 83%
rename from python/python3_12_1/Java/PythonLexerBase.java
rename to python/python3_12/Java/PythonLexerBase.java
index 2f9490617e..7e4f059d7d 100644
--- a/python/python3_12_1/Java/PythonLexerBase.java
+++ b/python/python3_12/Java/PythonLexerBase.java
@@ -49,11 +49,11 @@ public abstract class PythonLexerBase extends Lexer {
private boolean wasSpaceIndentation;
private boolean wasTabIndentation;
private boolean wasIndentationMixedWithSpacesAndTabs;
- private final int INVALID_LENGTH = -1;
- private CommonToken curToken; // current (under processing) token
+ private Token curToken; // current (under processing) token
private Token ffgToken; // following (look ahead) token
+ private final int INVALID_LENGTH = -1;
private final String ERR_TXT = " ERROR: ";
protected PythonLexerBase(CharStream input) {
@@ -61,6 +61,18 @@ protected PythonLexerBase(CharStream input) {
this.init();
}
+ @Override
+ public Token nextToken() { // reading the input stream until a return EOF
+ this.checkNextToken();
+ return this.pendingTokens.pollFirst(); // add the queued token to the token stream
+ }
+
+ @Override
+ public void reset() {
+ this.init();
+ super.reset();
+ }
+
private void init() {
this.indentLengthStack = new ArrayDeque<>();
this.pendingTokens = new LinkedList<>();
@@ -75,12 +87,6 @@ private void init() {
this.ffgToken = null;
}
- @Override
- public Token nextToken() { // reading the input stream until a return EOF
- this.checkNextToken();
- return this.pendingTokens.pollFirst(); // add the queued token to the token stream
- }
-
private void checkNextToken() {
if (this.previousPendingTokenType != Token.EOF) {
this.setCurrentAndFollowingTokens();
@@ -104,13 +110,10 @@ private void checkNextToken() {
case PythonLexer.NEWLINE:
this.handleNEWLINEtoken();
break;
- case PythonLexer.STRING:
- this.handleSTRINGtoken();
- break;
case PythonLexer.FSTRING_MIDDLE:
this.handleFSTRING_MIDDLE_token();
break;
- case PythonLexer.ERROR_TOKEN:
+ case PythonLexer.ERRORTOKEN:
this.reportLexerError("token recognition error at: '" + this.curToken.getText() + "'");
this.addPendingToken(this.curToken);
break;
@@ -126,8 +129,8 @@ private void checkNextToken() {
private void setCurrentAndFollowingTokens() {
this.curToken = this.ffgToken == null ?
- new CommonToken(super.nextToken()) :
- new CommonToken(this.ffgToken);
+ super.nextToken() :
+ this.ffgToken;
this.handleFStringLexerModes();
@@ -156,7 +159,8 @@ private void handleStartOfInput() {
this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
}
this.setCurrentAndFollowingTokens();
- } // continue the processing of the EOF token with checkNextToken()
+ }
+ // continue the processing of the EOF token with checkNextToken()
}
private void insertLeadingIndentToken() {
@@ -175,19 +179,18 @@ private void handleNEWLINEtoken() {
if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
this.hideAndAddPendingToken(this.curToken);
} else {
- CommonToken nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
+ final Token nlToken = new CommonToken(this.curToken); // save the current NEWLINE token
final boolean isLookingAhead = this.ffgToken.getType() == PythonLexer.WS;
if (isLookingAhead) {
this.setCurrentAndFollowingTokens(); // set the next two tokens
}
switch (this.ffgToken.getType()) {
- case PythonLexer.NEWLINE: // We're before a blank line
- case PythonLexer.COMMENT: // We're before a comment
- case PythonLexer.TYPE_COMMENT: // We're before a type comment
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
this.hideAndAddPendingToken(nlToken);
if (isLookingAhead) {
- this.addPendingToken(this.curToken); // WS token
+ this.addPendingToken(this.curToken); // WS token
}
break;
default:
@@ -228,19 +231,6 @@ private void insertIndentOrDedentToken(final int indentLength) {
}
}
- private void handleSTRINGtoken() { // remove the \ escape sequences from the string literal
- final String line_joinFreeStringLiteral = this.curToken.getText().replaceAll("\\\\\\r?\\n", "");
- if (this.curToken.getText().length() == line_joinFreeStringLiteral.length()) {
- this.addPendingToken(this.curToken);
- } else {
- CommonToken originalSTRINGtoken = new CommonToken(this.curToken); // backup the original token
- this.curToken.setText(line_joinFreeStringLiteral);
- this.addPendingToken(this.curToken); // add the modified token with inline string literal
- this.hideAndAddPendingToken(originalSTRINGtoken); // add the original token to the hidden channel
- // this inserted hidden token allows to restore the original string literal with the \ escape sequences
- }
- }
-
private void handleFSTRING_MIDDLE_token() { // replace the double braces '{{' or '}}' to single braces and hide the second braces
String fsMid = this.curToken.getText();
fsMid = fsMid.replaceAll("\\{\\{", "{_").replaceAll("}}", "}_"); // replace: {{ --> {_ and }} --> }_
@@ -248,7 +238,7 @@ private void handleFSTRING_MIDDLE_token() { // replace the double braces '{{' or
for (String s : arrOfStr) {
if (!s.isEmpty()) {
this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, this.ffgToken);
- String lastCharacter = s.substring(s.length() - 1);
+ final String lastCharacter = s.substring(s.length() - 1);
if ("{}".contains(lastCharacter)) {
this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, lastCharacter, this.ffgToken);
// this inserted hidden token allows to restore the original f-string literal with the double braces
@@ -261,7 +251,7 @@ private void handleFStringLexerModes() { // https://peps.python.org/pep-0498/#sp
if (!this._modeStack.isEmpty()) {
switch (this.curToken.getType()) {
case PythonLexer.LBRACE:
- this.pushMode(PythonLexer.DEFAULT_MODE);
+ this.pushMode(Lexer.DEFAULT_MODE);
this.paren_or_bracket_openedStack.push(0);
break;
case PythonLexer.LPAR:
@@ -291,7 +281,7 @@ private void handleFStringLexerModes() { // https://peps.python.org/pep-0498/#sp
break;
case PythonLexer.RBRACE:
switch (this._mode) {
- case PythonLexer.DEFAULT_MODE:
+ case Lexer.DEFAULT_MODE:
case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE:
case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE:
this.popMode();
@@ -339,36 +329,37 @@ private void handleEOFtoken() {
this.addPendingToken(this.curToken);
}
- private void hideAndAddPendingToken(CommonToken cToken) {
- cToken.setChannel(Token.HIDDEN_CHANNEL);
- this.addPendingToken(cToken);
+ private void hideAndAddPendingToken(final Token tkn) {
+ CommonToken ctkn = new CommonToken(tkn);
+ ctkn.setChannel(Token.HIDDEN_CHANNEL);
+ this.addPendingToken(ctkn);
}
- private void createAndAddPendingToken(final int type, final int channel, final String text, Token baseToken) {
- CommonToken cToken = new CommonToken(baseToken);
- cToken.setType(type);
- cToken.setChannel(channel);
- cToken.setStopIndex(baseToken.getStartIndex() - 1);
- cToken.setText(text == null
- ? "<" + this.getVocabulary().getSymbolicName(type) + ">"
- : text);
+ private void createAndAddPendingToken(final int ttype, final int channel, final String text, Token sampleToken) {
+ CommonToken ctkn = new CommonToken(sampleToken);
+ ctkn.setType(ttype);
+ ctkn.setChannel(channel);
+ ctkn.setStopIndex(sampleToken.getStartIndex() - 1);
+ ctkn.setText(text == null
+ ? "<" + this.getVocabulary().getDisplayName(ttype) + ">"
+ : text);
- this.addPendingToken(cToken);
+ this.addPendingToken(ctkn);
}
- private void addPendingToken(final Token token) {
+ private void addPendingToken(final Token tkn) {
// save the last pending token type because the pendingTokens linked list can be empty by the nextToken()
- this.previousPendingTokenType = token.getType();
- if (token.getChannel() == Token.DEFAULT_CHANNEL) {
+ this.previousPendingTokenType = tkn.getType();
+ if (tkn.getChannel() == Token.DEFAULT_CHANNEL) {
this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
}
- this.pendingTokens.addLast(token);
+ this.pendingTokens.addLast(tkn);
}
- private int getIndentationLength(final String textWS) { // the textWS may contain spaces, tabs or form feeds
+ private int getIndentationLength(final String indentText) { // the indentText may contain spaces, tabs or form feeds
final int TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces
int length = 0;
- for (char ch : textWS.toCharArray()) {
+ for (char ch : indentText.toCharArray()) {
switch (ch) {
case ' ':
this.wasSpaceIndentation = true;
@@ -387,7 +378,7 @@ private int getIndentationLength(final String textWS) { // the textWS may contai
if (this.wasTabIndentation && this.wasSpaceIndentation) {
if (!(this.wasIndentationMixedWithSpacesAndTabs)) {
this.wasIndentationMixedWithSpacesAndTabs = true;
- return this.INVALID_LENGTH; // only for the first inconsistent indent
+ length = this.INVALID_LENGTH; // only for the first inconsistent indent
}
}
return length;
@@ -400,13 +391,7 @@ private void reportLexerError(final String errMsg) {
private void reportError(final String errMsg) {
this.reportLexerError(errMsg);
- // the ERROR_TOKEN will raise an error in the parser
- this.createAndAddPendingToken(PythonLexer.ERROR_TOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken);
- }
-
- @Override
- public void reset() {
- this.init();
- super.reset();
+ // the ERRORTOKEN will raise an error in the parser
+ this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken);
}
}
diff --git a/python/python3_12_1/Java/PythonParserBase.java b/python/python3_12/Java/PythonParserBase.java
similarity index 100%
rename from python/python3_12_1/Java/PythonParserBase.java
rename to python/python3_12/Java/PythonParserBase.java
diff --git a/python/python3_12_1/JavaScript/PythonLexerBase.js b/python/python3_12/JavaScript/PythonLexerBase.js
similarity index 59%
rename from python/python3_12_1/JavaScript/PythonLexerBase.js
rename to python/python3_12/JavaScript/PythonLexerBase.js
index 2a7bebaf95..2c1ea71d47 100644
--- a/python/python3_12_1/JavaScript/PythonLexerBase.js
+++ b/python/python3_12/JavaScript/PythonLexerBase.js
@@ -27,7 +27,7 @@ THE SOFTWARE.
*
*/
-import { Token, CommonToken, Lexer } from "antlr4";
+import { Token, Lexer } from "antlr4";
import PythonLexer from "./PythonLexer.js";
export default class PythonLexerBase extends Lexer {
@@ -51,17 +51,27 @@ export default class PythonLexerBase extends Lexer {
this.wasSpaceIndentation;
this.wasTabIndentation;
this.wasIndentationMixedWithSpacesAndTabs;
- const INVALID_LENGTH = -1;
-
+
this.curToken; // current (under processing) token
this.ffgToken; // following (look ahead) token
- const ERR_TXT = " ERROR: ";
+ this.#init();
+ }
+
+ get #INVALID_LENGTH() { return -1; }
+ get #ERR_TXT() { return " ERROR: "; }
- this.init();
+ nextToken() { // reading the input stream until a return EOF
+ this.#checkNextToken();
+ return this.pendingTokens.shift() /* .pollFirst() */; // add the queued token to the token stream
}
- init() {
+ reset() {
+ this.#init();
+ super.reset();
+ }
+
+ #init() {
this.indentLengthStack = [];
this.pendingTokens = [];
this.previousPendingTokenType = 0;
@@ -75,16 +85,11 @@ export default class PythonLexerBase extends Lexer {
this.ffgToken = null;
}
- nextToken() { // reading the input stream until a return EOF
- this.checkNextToken();
- return this.pendingTokens.shift() /* .pollFirst() */; // add the queued token to the token stream
- }
-
- checkNextToken() {
+ #checkNextToken() {
if (this.previousPendingTokenType !== Token.EOF) {
- this.setCurrentAndFollowingTokens();
+ this.#setCurrentAndFollowingTokens();
if (this.indentLengthStack.length === 0) { // We're at the first token
- this.handleStartOfInput();
+ this.#handleStartOfInput();
}
switch (this.curToken.type) {
@@ -92,175 +97,158 @@ export default class PythonLexerBase extends Lexer {
case PythonLexer.LSQB:
case PythonLexer.LBRACE:
this.opened++;
- this.addPendingToken(this.curToken);
+ this.#addPendingToken(this.curToken);
break;
case PythonLexer.RPAR:
case PythonLexer.RSQB:
case PythonLexer.RBRACE:
this.opened--;
- this.addPendingToken(this.curToken);
+ this.#addPendingToken(this.curToken);
break;
case PythonLexer.NEWLINE:
- this.handleNEWLINEtoken();
- break;
- case PythonLexer.STRING:
- this.handleSTRINGtoken();
+ this.#handleNEWLINEtoken();
break;
case PythonLexer.FSTRING_MIDDLE:
- this.handleFSTRING_MIDDLE_token();
+ this.#handleFSTRING_MIDDLE_token();
break;
- case PythonLexer.ERROR_TOKEN:
- this.reportLexerError(`token recognition error at: '${this.curToken.text}'`);
- this.addPendingToken(this.curToken);
+ case PythonLexer.ERRORTOKEN:
+ this.#reportLexerError(`token recognition error at: '${this.curToken.text}'`);
+ this.#addPendingToken(this.curToken);
break;
case Token.EOF:
- this.handleEOFtoken();
+ this.#handleEOFtoken();
break;
default:
- this.addPendingToken(this.curToken);
+ this.#addPendingToken(this.curToken);
}
- this.handleFORMAT_SPECIFICATION_MODE();
+ this.#handleFORMAT_SPECIFICATION_MODE();
}
}
- setCurrentAndFollowingTokens() {
+ #setCurrentAndFollowingTokens() {
this.curToken = this.ffgToken == undefined ?
- this.getCommonTokenByToken(super.nextToken()) :
- this.getCommonTokenByToken(this.ffgToken);
+ super.nextToken() :
+ this.ffgToken;
- this.handleFStringLexerModes();
+ this.#handleFStringLexerModes();
this.ffgToken = this.curToken.type === Token.EOF ?
this.curToken :
- this.getCommonTokenByToken(super.nextToken());
+ super.nextToken();
}
// initialize the _indentLengthStack
// hide the leading NEWLINE token(s)
// if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
// insert a leading INDENT token if necessary
- handleStartOfInput() {
+ #handleStartOfInput() {
// initialize the stack with a default 0 indentation length
this.indentLengthStack.push(0); // this will never be popped off
while (this.curToken.type !== Token.EOF) {
if (this.curToken.channel === Token.DEFAULT_CHANNEL) {
if (this.curToken.type === PythonLexer.NEWLINE) {
// all the NEWLINE tokens must be ignored before the first statement
- this.hideAndAddPendingToken(this.curToken);
+ this.#hideAndAddPendingToken(this.curToken);
} else { // We're at the first statement
- this.insertLeadingIndentToken();
- return; // continue the processing of the current token with checkNextToken()
+ this.#insertLeadingIndentToken();
+ return; // continue the processing of the current token with #checkNextToken()
}
} else {
- this.addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+ this.#addPendingToken(this.curToken); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
}
- this.setCurrentAndFollowingTokens();
- } // continue the processing of the EOF token with checkNextToken()
+ this.#setCurrentAndFollowingTokens();
+ } // continue the processing of the EOF token with #checkNextToken()
}
- insertLeadingIndentToken() {
+ #insertLeadingIndentToken() {
if (this.previousPendingTokenType === PythonLexer.WS) {
let prevToken = this.pendingTokens.at(- 1) /* .peekLast() */; // WS token
- if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement
+ if (this.#getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement
const errMsg = "first statement indented";
- this.reportLexerError(errMsg);
+ this.#reportLexerError(errMsg);
// insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
- this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken);
+ this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.curToken);
}
}
}
- handleNEWLINEtoken() {
+ #handleNEWLINEtoken() {
if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
- this.hideAndAddPendingToken(this.curToken);
+ this.#hideAndAddPendingToken(this.curToken);
} else {
- let nlToken = this.getCommonTokenByToken(this.curToken); // save the current NEWLINE token
+ let nlToken = this.curToken.clone(); // save the current NEWLINE token
const isLookingAhead = this.ffgToken.type === PythonLexer.WS;
if (isLookingAhead) {
- this.setCurrentAndFollowingTokens(); // set the next two tokens
+ this.#setCurrentAndFollowingTokens(); // set the next two tokens
}
switch (this.ffgToken.type) {
- case PythonLexer.NEWLINE: // We're before a blank line
- case PythonLexer.COMMENT: // We're before a comment
- case PythonLexer.TYPE_COMMENT: // We're before a type comment
- this.hideAndAddPendingToken(nlToken);
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
+ this.#hideAndAddPendingToken(nlToken);
if (isLookingAhead) {
- this.addPendingToken(this.curToken); // WS token
+ this.#addPendingToken(this.curToken); // WS token
}
break;
default:
- this.addPendingToken(nlToken);
+ this.#addPendingToken(nlToken);
if (isLookingAhead) { // We're on whitespace(s) followed by a statement
const indentationLength = this.ffgToken.type === Token.EOF ?
- 0 :
- this.getIndentationLength(this.curToken.text);
+ 0 :
+ this.#getIndentationLength(this.curToken.text);
- if (indentationLength !== this.INVALID_LENGTH) {
- this.addPendingToken(this.curToken); // WS token
- this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
+ if (indentationLength !== this.#INVALID_LENGTH) {
+ this.#addPendingToken(this.curToken); // WS token
+ this.#insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
} else {
- this.reportError("inconsistent use of tabs and spaces in indentation");
+ this.#reportError("inconsistent use of tabs and spaces in indentation");
}
} else { // We're at a newline followed by a statement (there is no whitespace before the statement)
- this.insertIndentOrDedentToken(0); // may insert DEDENT token(s)
+ this.#insertIndentOrDedentToken(0); // may insert DEDENT token(s)
}
}
}
}
- insertIndentOrDedentToken(curIndentLength) {
+ #insertIndentOrDedentToken(curIndentLength) {
let prevIndentLength = this.indentLengthStack.at(-1) /* peek() */;
if (curIndentLength > prevIndentLength) {
- this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
+ this.#createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
this.indentLengthStack.push(curIndentLength);
} else {
while (curIndentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
this.indentLengthStack.pop();
prevIndentLength = this.indentLengthStack.at(-1) /* peek() */;
if (curIndentLength <= prevIndentLength) {
- this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
+ this.#createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken);
} else {
- this.reportError("inconsistent dedent");
+ this.#reportError("inconsistent dedent");
}
}
}
}
- handleSTRINGtoken() { // remove the \ escape sequences from the string literal
- const line_joinFreeStringLiteral = this.curToken.text.replace(/\\(\r?\n)/g, "");
- if (this.curToken.text.length === line_joinFreeStringLiteral.length) {
- this.addPendingToken(this.curToken);
- } else {
- let originalSTRINGtoken = this.getCommonTokenByToken(this.curToken); // backup the original token
- this.curToken.text = line_joinFreeStringLiteral;
- this.addPendingToken(this.curToken); // add the modified token with inline string literal
- this.hideAndAddPendingToken(originalSTRINGtoken); // add the original token to the hidden channel
- // this inserted hidden token allows to restore the original string literal with the \ escape sequences
- }
- }
-
- handleFSTRING_MIDDLE_token() { // replace the double braces '{{' or '}}' to single braces and hide the second braces
+ #handleFSTRING_MIDDLE_token() { // replace the double braces '{{' or '}}' to single braces and hide the second braces
let fsMid = this.curToken.text;
fsMid = fsMid.replaceAll(/\{\{/g, "{_").replaceAll(/\}\}/g, "}_"); // replace: {{ --> {_ and }} --> }_
let arrOfStr = fsMid.split(/(?<=[{}])_/); // split by {_ or }_
for (let s of arrOfStr) {
if (s) {
- this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, this.ffgToken);
+ this.#createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, this.ffgToken);
let lastCharacter = s.charAt(s.length - 1);
if ("{}".includes(lastCharacter)) {
- this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, lastCharacter, this.ffgToken);
+ this.#createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, lastCharacter, this.ffgToken);
// this inserted hidden token allows to restore the original f-string literal with the double braces
}
}
}
}
- handleFStringLexerModes() { // https://peps.python.org/pep-0498/#specification
+ #handleFStringLexerModes() { // https://peps.python.org/pep-0498/#specification
if (this._modeStack.length > 0) {
switch (this.curToken.type) {
case PythonLexer.LBRACE:
- this.pushMode(PythonLexer.DEFAULT_MODE);
+ this.pushMode(Lexer.DEFAULT_MODE);
this.paren_or_bracket_openedStack.push(0);
break;
case PythonLexer.LPAR:
@@ -278,26 +266,26 @@ export default class PythonLexerBase extends Lexer {
case PythonLexer.SINGLE_QUOTE_FSTRING_MODE:
case PythonLexer.LONG_SINGLE_QUOTE_FSTRING_MODE:
case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE:
- this.mode(PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ this.setMode(PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
break;
case PythonLexer.DOUBLE_QUOTE_FSTRING_MODE:
case PythonLexer.LONG_DOUBLE_QUOTE_FSTRING_MODE:
case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE:
- this.mode(PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ this.setMode(PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
break;
}
}
break;
case PythonLexer.RBRACE:
switch (this._mode) {
- case PythonLexer.DEFAULT_MODE:
+ case Lexer.DEFAULT_MODE:
case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE:
case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE:
this.popMode();
this.paren_or_bracket_openedStack.pop();
break;
default:
- this.reportLexerError("f-string: single '}' is not allowed");
+ this.#reportLexerError("f-string: single '}' is not allowed");
break;
}
break;
@@ -305,78 +293,68 @@ export default class PythonLexerBase extends Lexer {
}
}
- handleFORMAT_SPECIFICATION_MODE() {
+ #handleFORMAT_SPECIFICATION_MODE() {
if (this._modeStack.length > 0 && this.ffgToken.type === PythonLexer.RBRACE) {
switch (this.curToken.type) {
case PythonLexer.COLON:
case PythonLexer.RBRACE:
// insert an empty FSTRING_MIDDLE token instead of the missing format specification
- this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken);
+ this.#createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken);
break;
}
}
}
- insertTrailingTokens() {
+ #insertTrailingTokens() {
switch (this.lastPendingTokenTypeFromDefaultChannel) {
case PythonLexer.NEWLINE:
case PythonLexer.DEDENT:
break; // no trailing NEWLINE token is needed
default:
// insert an extra trailing NEWLINE token that serves as the end of the last statement
- this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // _ffgToken is EOF
+ this.#createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken); // _ffgToken is EOF
}
- this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
+ this.#insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
}
- handleEOFtoken() {
+ #handleEOFtoken() {
if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
// there was a statement in the input (leading NEWLINE tokens are hidden)
- this.insertTrailingTokens();
+ this.#insertTrailingTokens();
}
- this.addPendingToken(this.curToken);
+ this.#addPendingToken(this.curToken);
}
- hideAndAddPendingToken(cToken) {
- cToken.channel = Token.HIDDEN_CHANNEL;
- this.addPendingToken(cToken);
+ #hideAndAddPendingToken(ctkn) {
+ ctkn.channel = Token.HIDDEN_CHANNEL;
+ this.#addPendingToken(ctkn);
}
- createAndAddPendingToken(type, channel, text, baseToken) {
- const cToken = this.getCommonTokenByToken(baseToken);
- cToken.type = type;
- cToken.channel = channel;
- cToken.stop = baseToken.start - 1;
- cToken.text = text == null ?
+ #createAndAddPendingToken(type, channel, text, sampleToken) {
+ const ctkn = sampleToken.clone();
+ ctkn.type = type;
+ ctkn.channel = channel;
+ ctkn.stop = sampleToken.start - 1;
+ ctkn.text = text == null ?
`<${this.getSymbolicNames()[type]}>` :
text;
- this.addPendingToken(cToken);
+ this.#addPendingToken(ctkn);
}
- addPendingToken(token) {
+ #addPendingToken(tkn) {
// save the last pending token type because the _pendingTokens linked list can be empty by the nextToken()
- this.previousPendingTokenType = token.type;
- if (token.channel === Token.DEFAULT_CHANNEL) {
+ this.previousPendingTokenType = tkn.type;
+ if (tkn.channel === Token.DEFAULT_CHANNEL) {
this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
}
- this.pendingTokens.push(token) /* .addLast(token) */;
- }
-
- getCommonTokenByToken(oldToken) {
- let commonToken = new CommonToken(oldToken.source, oldToken.type, oldToken.channel, oldToken.start, oldToken.stop);
- commonToken.tokenIndex = oldToken.tokenIndex;
- commonToken.line = oldToken.line;
- commonToken.column = oldToken.column;
- commonToken.text = oldToken.text;
- return commonToken;
+ this.pendingTokens.push(tkn) /* .addLast(token) */;
}
- getIndentationLength(textWS) { // the textWS may contain spaces, tabs or form feeds
+ #getIndentationLength(indentText) { // the indentText may contain spaces, tabs or form feeds
const TAB_LENGTH = 8; // the standard number of spaces to replace a tab to spaces
let length = 0;
-
- for (let ch of textWS) {
+ for (let ch of indentText) {
switch (ch) {
case " ":
this.wasSpaceIndentation = true;
@@ -395,25 +373,20 @@ export default class PythonLexerBase extends Lexer {
if (this.wasTabIndentation && this.wasSpaceIndentation) {
if (!this.wasIndentationMixedWithSpacesAndTabs) {
this.wasIndentationMixedWithSpacesAndTabs = true;
- return this.INVALID_LENGTH; // only for the first inconsistent indent
+ length = this.#INVALID_LENGTH; // only for the first inconsistent indent
}
}
return length;
}
- reportLexerError(errMsg) {
- this.getErrorListenerDispatch().syntaxError(this, this.curToken, this.curToken.line, this.curToken.column, " LEXER" + this.ERR_TXT + errMsg, null);
+ #reportLexerError(errMsg) {
+ this.getErrorListener().syntaxError(this, this.curToken, this.curToken.line, this.curToken.column, " LEXER" + this.#ERR_TXT + errMsg, null);
}
- reportError(errMsg) {
- this.reportLexerError(errMsg);
-
- // the ERROR_TOKEN will raise an error in the parser
- this.createAndAddPendingToken(PythonLexer.ERROR_TOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken);
- }
+ #reportError(errMsg) {
+ this.#reportLexerError(errMsg);
- reset() {
- this.init();
- super.reset();
+ // the ERRORTOKEN will raise an error in the parser
+ this.#createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.#ERR_TXT + errMsg, this.ffgToken);
}
}
diff --git a/python/python3_12_1/JavaScript/PythonParserBase.js b/python/python3_12/JavaScript/PythonParserBase.js
similarity index 100%
rename from python/python3_12_1/JavaScript/PythonParserBase.js
rename to python/python3_12/JavaScript/PythonParserBase.js
diff --git a/python/python3_12/Python3/PythonLexerBase.py b/python/python3_12/Python3/PythonLexerBase.py
new file mode 100644
index 0000000000..d9a95ea764
--- /dev/null
+++ b/python/python3_12/Python3/PythonLexerBase.py
@@ -0,0 +1,309 @@
+# The MIT License (MIT)
+# Copyright (c) 2021 Robert Einhorn
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# Project : Python Indent/Dedent handler for ANTLR4 grammars
+#
+# Developed by : Robert Einhorn
+
+from typing import TextIO
+from antlr4 import InputStream, Lexer, Token
+from antlr4.Token import CommonToken
+import sys
+import re
+
+class PythonLexerBase(Lexer):
+ def __init__(self, input: InputStream, output: TextIO = sys.stdout):
+ super().__init__(input, output)
+
+ # A stack that keeps track of the indentation lengths
+ self.__indent_length_stack: list[int]
+
+ # A list where tokens are waiting to be loaded into the token stream
+ self.__pending_tokens: list[CommonToken]
+
+ # last pending token types
+ self.__previous_pending_token_type: int
+ self.__last_pending_token_type_from_default_channel: int
+
+ # The amount of opened parentheses, square brackets or curly braces
+ self.__opened: int
+ # The amount of opened parentheses and square brackets in the current lexer mode
+ self.__paren_or_bracket_opened_stack: list[int]
+
+ self.__was_space_indentation: bool
+ self.__was_tab_indentation: bool
+ self.__was_indentation_mixed_with_spaces_and_tabs: bool
+
+ self.__cur_token: CommonToken # current (under processing) token
+ self.__ffg_token: CommonToken # following (look ahead) token
+
+ self.__INVALID_LENGTH: int = -1
+ self.__ERR_TXT: str = " ERROR: "
+
+ self.__init()
+
+ def nextToken(self) -> CommonToken: # reading the input stream until a return EOF
+ self.__check_next_token()
+ return self.__pending_tokens.pop(0) # add the queued token to the token stream
+
+ def reset(self) -> None:
+ self.__init()
+ super().reset()
+
+ def __init(self) -> None:
+ self.__indent_length_stack = []
+ self.__pending_tokens = []
+ self.__previous_pending_token_type = 0
+ self.__last_pending_token_type_from_default_channel = 0
+ self.__opened = 0
+ self.__paren_or_bracket_opened_stack = []
+ self.__was_space_indentation = False
+ self.__was_tab_indentation = False
+ self.__was_indentation_mixed_with_spaces_and_tabs = False
+ self.__cur_token = None
+ self.__ffg_token = None
+
+ def __check_next_token(self) -> None:
+ if self.__previous_pending_token_type != Token.EOF:
+ self.__set_current_and_following_tokens()
+ if len(self.__indent_length_stack) == 0: # We're at the first token
+ self.__handle_start_of_input()
+
+ match self.__cur_token.type:
+ case self.LPAR | self.LSQB | self.LBRACE:
+ self.__opened += 1
+ self.__add_pending_token(self.__cur_token)
+ case self.RPAR | self.RSQB | self.RBRACE:
+ self.__opened -= 1
+ self.__add_pending_token(self.__cur_token)
+ case self.NEWLINE:
+ self.__handle_NEWLINE_token()
+ case self.FSTRING_MIDDLE:
+ self.__handle_FSTRING_MIDDLE_token()
+ case self.ERRORTOKEN:
+ self.__report_lexer_error("token recognition error at: '" + self.__cur_token.text + "'")
+ self.__add_pending_token(self.__cur_token)
+ case Token.EOF:
+ self.__handle_EOF_token()
+ case other:
+ self.__add_pending_token(self.__cur_token)
+ self.__handle_FORMAT_SPECIFICATION_MODE()
+
+ def __set_current_and_following_tokens(self) -> None:
+ self.__cur_token = super().nextToken() if self.__ffg_token is None else \
+ self.__ffg_token
+
+ self.__handle_fstring_lexer_modes()
+
+ self.__ffg_token = self.__cur_token if self.__cur_token.type == Token.EOF else \
+ super().nextToken()
+
+ # initialize the _indent_length_stack
+ # hide the leading NEWLINE token(s)
+ # if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
+ # insert a leading INDENT token if necessary
+ def __handle_start_of_input(self) -> None:
+ # initialize the stack with a default 0 indentation length
+ self.__indent_length_stack.append(0) # this will never be popped off
+ while self.__cur_token.type != Token.EOF:
+ if self.__cur_token.channel == Token.DEFAULT_CHANNEL:
+ if self.__cur_token.type == self.NEWLINE:
+ # all the NEWLINE tokens must be ignored before the first statement
+ self.__hide_and_add_pending_token(self.__cur_token)
+ else: # We're at the first statement
+ self.__insert_leading_indent_token()
+ return # continue the processing of the current token with __check_next_token()
+ else:
+ self.__add_pending_token(self.__cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+ self.__set_current_and_following_tokens()
+ # continue the processing of the EOF token with __check_next_token()
+
+ def __insert_leading_indent_token(self) -> None:
+ if self.__previous_pending_token_type == self.WS:
+ prev_token: CommonToken = self.__pending_tokens[-1] # WS token
+ if self.__get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement
+ err_msg: str = "first statement indented"
+ self.__report_lexer_error(err_msg)
+ # insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
+ self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__cur_token)
+
+ def __handle_NEWLINE_token(self) -> None:
+ if self.__opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token
+ self.__hide_and_add_pending_token(self.__cur_token)
+ else:
+ nl_token: CommonToken = self.__cur_token.clone() # save the current NEWLINE token
+ is_looking_ahead: bool = self.__ffg_token.type == self.WS
+ if is_looking_ahead:
+ self.__set_current_and_following_tokens() # set the next two tokens
+
+ match self.__ffg_token.type:
+ case self.NEWLINE | self.COMMENT:
+ # We're before a blank line or a comment or type comment or a type ignore comment
+ self.__hide_and_add_pending_token(nl_token) # ignore the NEWLINE token
+ if is_looking_ahead:
+ self.__add_pending_token(self.__cur_token) # WS token
+ case other:
+ self.__add_pending_token(nl_token)
+ if is_looking_ahead: # We're on whitespace(s) followed by a statement
+ indentation_length: int = 0 if self.__ffg_token.type == Token.EOF else \
+ self.__get_indentation_length(self.__cur_token.text)
+
+ if indentation_length != self.__INVALID_LENGTH:
+ self.__add_pending_token(self.__cur_token) # WS token
+ self.__insert_indent_or_dedent_token(indentation_length) # may insert INDENT token or DEDENT token(s)
+ else:
+ self.__report_error("inconsistent use of tabs and spaces in indentation")
+ else: # We're at a newline followed by a statement (there is no whitespace before the statement)
+ self.__insert_indent_or_dedent_token(0) # may insert DEDENT token(s)
+
+ def __insert_indent_or_dedent_token(self, indent_length: int) -> None:
+ prev_indent_length: int = self.__indent_length_stack[-1] # peek()
+ if indent_length > prev_indent_length:
+ self.__create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token)
+ self.__indent_length_stack.append(indent_length)
+ else:
+ while indent_length < prev_indent_length: # more than 1 DEDENT token may be inserted to the token stream
+ self.__indent_length_stack.pop()
+ prev_indent_length = self.__indent_length_stack[-1] # peek()
+ if indent_length <= prev_indent_length:
+ self.__create_and_add_pending_token(self.DEDENT, Token.DEFAULT_CHANNEL, None, self.__ffg_token)
+ else:
+ self.__report_error("inconsistent dedent")
+
+ def __handle_FSTRING_MIDDLE_token(self) -> None: # replace the double braces '{{' or '}}' to single braces and hide the second braces
+ fs_mid: str = self.__cur_token.text
+ fs_mid = fs_mid.replace("{{", "{_").replace("}}", "}_") # replace: {{ --> {_ and }} --> }_
+ arr_of_str: list[str] = re.split(r"(?<=[{}])_", fs_mid) # split by {_ or }_
+ s: str
+ for s in arr_of_str:
+ if s:
+ self.__create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, self.__ffg_token)
+ last_character: str = s[-1:]
+ if last_character in "{}":
+ self.__create_and_add_pending_token(self.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, last_character, self.__ffg_token)
+
+ def __handle_fstring_lexer_modes(self) -> None:
+ if self._modeStack:
+ match self.__cur_token.type:
+ case self.LBRACE:
+ self.pushMode(Lexer.DEFAULT_MODE)
+ self.__paren_or_bracket_opened_stack.append(0)
+ case self.LPAR | self.LSQB:
+ # https://peps.python.org/pep-0498/#lambdas-inside-expressions
+ self.__paren_or_bracket_opened_stack[-1] += 1 # increment the last element (peek() + 1)
+ case self.RPAR | self.RSQB:
+ self.__paren_or_bracket_opened_stack[-1] -= 1 # decrement the last element (peek() - 1)
+ case self.COLON:
+ if self.__paren_or_bracket_opened_stack[-1] == 0:
+ match self._modeStack[-1]: # check the previous lexer mode (the current is DEFAULT_MODE)
+ case self.SINGLE_QUOTE_FSTRING_MODE \
+ | self.LONG_SINGLE_QUOTE_FSTRING_MODE \
+ | self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE:
+
+ self.mode(self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
+ case self.DOUBLE_QUOTE_FSTRING_MODE \
+ | self.LONG_DOUBLE_QUOTE_FSTRING_MODE \
+ | self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE:
+
+ self.mode(self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
+ case self.RBRACE:
+ match self._mode:
+ case Lexer.DEFAULT_MODE \
+ | self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE \
+ | self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE:
+
+ self.popMode()
+ self.__paren_or_bracket_opened_stack.pop()
+ case other:
+ self.__report_lexer_error("f-string: single '}' is not allowed")
+
+ def __handle_FORMAT_SPECIFICATION_MODE(self) -> None:
+ if len(self._modeStack) != 0 \
+ and self.__ffg_token.type == self.RBRACE:
+
+ match self.__cur_token.type:
+ case self.COLON | self.RBRACE:
+ # insert an empty FSTRING_MIDDLE token instead of the missing format specification
+ self.__create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", self.__ffg_token)
+
+ def __insert_trailing_tokens(self) -> None:
+ match self.__last_pending_token_type_from_default_channel:
+ case self.NEWLINE | self.DEDENT:
+ pass # no trailing NEWLINE token is needed
+ case other:
+ # insert an extra trailing NEWLINE token that serves as the end of the last statement
+ self.__create_and_add_pending_token(self.NEWLINE, Token.DEFAULT_CHANNEL, None, self.__ffg_token) # _ffg_token is EOF
+ self.__insert_indent_or_dedent_token(0) # Now insert as much trailing DEDENT tokens as needed
+
+ def __handle_EOF_token(self) -> None:
+ if self.__last_pending_token_type_from_default_channel > 0:
+ # there was statement in the input (leading NEWLINE tokens are hidden)
+ self.__insert_trailing_tokens()
+ self.__add_pending_token(self.__cur_token)
+
+ def __hide_and_add_pending_token(self, ctkn: CommonToken) -> None:
+ ctkn.channel = Token.HIDDEN_CHANNEL
+ self.__add_pending_token(ctkn)
+
+ def __create_and_add_pending_token(self, ttype: int, channel: int, text: str, sample_token: CommonToken) -> None:
+ ctkn: CommonToken = sample_token.clone()
+ ctkn.type = ttype
+ ctkn.channel = channel
+ ctkn.stop = sample_token.start - 1
+ ctkn.text = "<" + self.symbolicNames[ttype] + ">" if text is None else \
+ text
+
+ self.__add_pending_token(ctkn)
+
+ def __add_pending_token(self, ctkn: CommonToken) -> None:
+ # save the last pending token type because the _pending_tokens list can be empty by the nextToken()
+ self.__previous_pending_token_type = ctkn.type
+ if ctkn.channel == Token.DEFAULT_CHANNEL:
+ self.__last_pending_token_type_from_default_channel = self.__previous_pending_token_type
+ self.__pending_tokens.append(ctkn)
+
+ def __get_indentation_length(self, indentText: str) -> int: # the indentText may contain spaces, tabs or form feeds
+ TAB_LENGTH: int = 8 # the standard number of spaces to replace a tab to spaces
+ length: int = 0
+ ch: str
+ for ch in indentText:
+ match ch:
+ case ' ':
+ self.__was_space_indentation = True
+ length += 1
+ case '\t':
+ self.__was_tab_indentation = True
+ length += TAB_LENGTH - (length % TAB_LENGTH)
+ case '\f': # form feed
+ length = 0
+
+ if self.__was_tab_indentation and self.__was_space_indentation:
+ if not self.__was_indentation_mixed_with_spaces_and_tabs:
+ self.__was_indentation_mixed_with_spaces_and_tabs = True
+ length = self.__INVALID_LENGTH # only for the first inconsistent indent
+ return length
+
+ def __report_lexer_error(self, err_msg: str) -> None:
+ self.getErrorListenerDispatch().syntaxError(self, self.__cur_token, self.__cur_token.line, self.__cur_token.column, " LEXER" + self.__ERR_TXT + err_msg, None)
+
+ def __report_error(self, err_msg: str) -> None:
+ self.__report_lexer_error(err_msg)
+
+ # the ERRORTOKEN will raise an error in the parser
+ self.__create_and_add_pending_token(self.ERRORTOKEN, Token.DEFAULT_CHANNEL, self.__ERR_TXT + err_msg, self.__ffg_token)
diff --git a/python/python3_12_1/Python3/PythonParserBase.py b/python/python3_12/Python3/PythonParserBase.py
similarity index 100%
rename from python/python3_12_1/Python3/PythonParserBase.py
rename to python/python3_12/Python3/PythonParserBase.py
diff --git a/python/python3_12_1/Python3/README.md b/python/python3_12/Python3/README.md
similarity index 100%
rename from python/python3_12_1/Python3/README.md
rename to python/python3_12/Python3/README.md
diff --git a/python/python3_12_1/Python3/transformGrammar.py b/python/python3_12/Python3/transformGrammar.py
similarity index 100%
rename from python/python3_12_1/Python3/transformGrammar.py
rename to python/python3_12/Python3/transformGrammar.py
diff --git a/python/python3_12_1/Python3_12_1_official_grammar.peg b/python/python3_12/Python3_12_6_official_grammar.peg
similarity index 99%
rename from python/python3_12_1/Python3_12_1_official_grammar.peg
rename to python/python3_12/Python3_12_6_official_grammar.peg
index 49d4a9ad34..8714b514d1 100644
--- a/python/python3_12_1/Python3_12_1_official_grammar.peg
+++ b/python/python3_12/Python3_12_6_official_grammar.peg
@@ -1,3 +1,7 @@
+# PEG grammar for Python
+
+
+
# ========================= START OF THE GRAMMAR =========================
# General grammatical elements and rules:
@@ -494,9 +498,7 @@ type_param_seq: ','.type_param+ [',']
type_param:
| NAME [type_param_bound]
- | '*' NAME ':' expression
| '*' NAME
- | '**' NAME ':' expression
| '**' NAME
type_param_bound: ':' expression
diff --git a/python/python3_12_1/PythonLexer.g4 b/python/python3_12/PythonLexer.g4
similarity index 98%
rename from python/python3_12_1/PythonLexer.g4
rename to python/python3_12/PythonLexer.g4
index e3a5ed3ecb..ead8b3c89e 100644
--- a/python/python3_12_1/PythonLexer.g4
+++ b/python/python3_12/PythonLexer.g4
@@ -32,6 +32,7 @@ options { superClass=PythonLexerBase; }
tokens {
INDENT, DEDENT // https://docs.python.org/3.12/reference/lexical_analysis.html#indentation
+ , TYPE_COMMENT // not supported, only for compatibility with the PythonParser.g4 grammar
, FSTRING_START, FSTRING_MIDDLE, FSTRING_END // https://peps.python.org/pep-0701/#specification
}
@@ -147,15 +148,8 @@ STRING
| BYTES_LITERAL
;
-// https://peps.python.org/pep-0484/#type-comments
-TYPE_COMMENT
- : '#' WS? 'type:' ~[\r\n]*
- ;
-
// https://docs.python.org/3.12/reference/lexical_analysis.html#physical-lines
-NEWLINE
- : OS_INDEPENDENT_NL
- ;
+NEWLINE : '\r'? '\n'; // Unix, Windows
// https://docs.python.org/3.12/reference/lexical_analysis.html#comments
COMMENT : '#' ~[\r\n]* -> channel(HIDDEN);
@@ -172,7 +166,7 @@ DOUBLE_QUOTE_FSTRING_START : F_STRING_PREFIX ["] -> type(FSTRING_STAR
LONG_SINGLE_QUOTE_FSTRING_START : F_STRING_PREFIX ['][']['] -> type(FSTRING_START), pushMode(LONG_SINGLE_QUOTE_FSTRING_MODE);
LONG_DOUBLE_QUOTE_FSTRING_START : F_STRING_PREFIX ["]["]["] -> type(FSTRING_START), pushMode(LONG_DOUBLE_QUOTE_FSTRING_MODE);
-ERROR_TOKEN : . ; // catch the unrecognized characters and redirect these errors to the parser
+ERRORTOKEN : . ; // catch the unrecognized characters and redirect these errors to the parser
/*
@@ -214,6 +208,8 @@ mode DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE; // only used after a format specifi
* fragments
*/
+fragment IGNORE: 'ignore';
+
// https://docs.python.org/3.12/reference/lexical_analysis.html#literals
// https://docs.python.org/3.12/reference/lexical_analysis.html#string-and-bytes-literals
@@ -240,10 +236,10 @@ fragment SHORT_STRING_CHAR_NO_DOUBLE_QUOTE : ~[\\\r\n"]; //
-fragment STRING_ESCAPE_SEQ
- : '\\' OS_INDEPENDENT_NL // \ escape sequence
- | '\\' . // "\"
- ; // the \ (not \n) escape sequences will be removed from the string literals by the PythonLexerBase class
+fragment STRING_ESCAPE_SEQ // https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences
+ : '\\' '\r' '\n' // for the two-character Windows line break: \ escape sequence (string literal line continuation)
+ | '\\' . // "\"
+ ;
fragment BYTES_LITERAL : BYTES_PREFIX (SHORT_BYTES | LONG_BYTES);
fragment BYTES_PREFIX : 'b' | 'B' | 'br' | 'Br' | 'bR' | 'BR' | 'rb' | 'rB' | 'Rb' | 'RB';
@@ -315,9 +311,6 @@ fragment EXPONENT : ('e' | 'E') ('+' | '-')? DIGIT_PART;
// https://docs.python.org/3.12/reference/lexical_analysis.html#imaginary-literals
fragment IMAG_NUMBER : (FLOAT_NUMBER | DIGIT_PART) ('j' | 'J');
-// https://docs.python.org/3.12/reference/lexical_analysis.html#physical-lines
-fragment OS_INDEPENDENT_NL : '\r'? '\n'; // Unix, Windows
-
// https://github.com/RobEin/ANTLR4-parser-for-Python-3.12/tree/main/valid_chars_in_py_identifiers
fragment ID_CONTINUE:
ID_START
diff --git a/python/python3_12_1/PythonParser.g4 b/python/python3_12/PythonParser.g4
similarity index 99%
rename from python/python3_12_1/PythonParser.g4
rename to python/python3_12/PythonParser.g4
index c245eb5711..fc86feac7c 100644
--- a/python/python3_12_1/PythonParser.g4
+++ b/python/python3_12/PythonParser.g4
@@ -27,7 +27,7 @@ THE SOFTWARE.
*
*/
-parser grammar PythonParser; // Python 3.12.1 https://docs.python.org/3.12/reference/grammar.html#full-grammar-specification
+parser grammar PythonParser; // Python 3.12.6 https://docs.python.org/3.12/reference/grammar.html#full-grammar-specification
options {
tokenVocab=PythonLexer;
superClass=PythonParserBase;
@@ -475,8 +475,8 @@ type_param_seq: type_param (',' type_param)* ','?;
type_param
: NAME type_param_bound?
- | '*' NAME (':' expression)?
- | '**' NAME (':' expression)?
+ | '*' NAME
+ | '**' NAME
;
diff --git a/python/python3_12_1/README.md b/python/python3_12/README.md
similarity index 94%
rename from python/python3_12_1/README.md
rename to python/python3_12/README.md
index ca6cb76b5e..ffe8f60c50 100644
--- a/python/python3_12_1/README.md
+++ b/python/python3_12/README.md
@@ -1,4 +1,4 @@
-# Python 3.12.1 parser
+# Python 3.12.6 parser
### About files:
- PythonParser.g4
diff --git a/python/python3_12/TypeScript/PythonLexerBase.ts b/python/python3_12/TypeScript/PythonLexerBase.ts
new file mode 100644
index 0000000000..ce72f1782d
--- /dev/null
+++ b/python/python3_12/TypeScript/PythonLexerBase.ts
@@ -0,0 +1,392 @@
+/*
+The MIT License (MIT)
+Copyright (c) 2021 Robert Einhorn
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+ */
+
+/*
+ *
+ * Project : Python Indent/Dedent handler for ANTLR4 grammars
+ *
+ * Developed by : Robert Einhorn, robert.einhorn.hu@gmail.com
+ *
+ */
+
+import { CharStream, Token, Lexer } from "antlr4";
+import PythonLexer from "./PythonLexer";
+import * as Collections from "typescript-collections";
+
+export default abstract class PythonLexerBase extends Lexer {
+ // A stack that keeps track of the indentation lengths
+ private indentLengthStack!: Collections.Stack;
+ // A list where tokens are waiting to be loaded into the token stream
+ private pendingTokens!: Array;
+
+ // last pending token types
+ private previousPendingTokenType!: number;
+ private lastPendingTokenTypeFromDefaultChannel!: number;
+
+ // The amount of opened parentheses, square brackets or curly braces
+ private opened!: number;
+ // The amount of opened parentheses and square brackets in the current lexer mode
+ private paren_or_bracket_openedStack!: Collections.Stack;
+
+ private wasSpaceIndentation!: boolean;
+ private wasTabIndentation!: boolean;
+ private wasIndentationMixedWithSpacesAndTabs!: boolean;
+
+ private curToken: Token | undefined; // current (under processing) token
+ private ffgToken: Token | undefined; // following (look ahead) token
+
+ private readonly INVALID_LENGTH: number = -1;
+ private readonly ERR_TXT: string = " ERROR: ";
+
+ protected constructor(input: CharStream) {
+ super(input);
+ this.init();
+ }
+
+ public nextToken(): Token { // reading the input stream until a return EOF
+ this.checkNextToken();
+ return this.pendingTokens.shift()! /* .pollFirst() */; // add the queued token to the token stream
+ }
+
+ public reset(): void {
+ this.init();
+ super.reset();
+ }
+
+ private init(): void {
+ this.indentLengthStack = new Collections.Stack();
+ this.pendingTokens = [];
+ this.previousPendingTokenType = 0;
+ this.lastPendingTokenTypeFromDefaultChannel = 0;
+ this.opened = 0;
+ this.paren_or_bracket_openedStack = new Collections.Stack();
+ this.wasSpaceIndentation = false;
+ this.wasTabIndentation = false;
+ this.wasIndentationMixedWithSpacesAndTabs = false;
+ this.curToken = undefined;
+ this.ffgToken = undefined;
+ }
+
+ private checkNextToken(): void {
+ if (this.previousPendingTokenType !== PythonLexer.EOF) {
+ this.setCurrentAndFollowingTokens();
+ if (this.indentLengthStack.isEmpty()) { // We're at the first token
+ this.handleStartOfInput();
+ }
+
+ switch (this.curToken!.type) {
+ case PythonLexer.LPAR:
+ case PythonLexer.LSQB:
+ case PythonLexer.LBRACE:
+ this.opened++;
+ this.addPendingToken(this.curToken!);
+ break;
+ case PythonLexer.RPAR:
+ case PythonLexer.RSQB:
+ case PythonLexer.RBRACE:
+ this.opened--;
+ this.addPendingToken(this.curToken!);
+ break;
+ case PythonLexer.NEWLINE:
+ this.handleNEWLINEtoken();
+ break;
+ case PythonLexer.FSTRING_MIDDLE:
+ this.handleFSTRING_MIDDLE_token();
+ break;
+ case PythonLexer.ERRORTOKEN:
+ this.reportLexerError(`token recognition error at: '${this.curToken!.text}'`);
+ this.addPendingToken(this.curToken!);
+ break;
+ case PythonLexer.EOF:
+ this.handleEOFtoken();
+ break;
+ default:
+ this.addPendingToken(this.curToken!);
+ }
+ this.handleFORMAT_SPECIFICATION_MODE();
+ }
+ }
+
+ private setCurrentAndFollowingTokens(): void {
+ this.curToken = this.ffgToken == undefined
+ ? super.nextToken()
+ : this.ffgToken;
+
+ this.handleFStringLexerModes();
+
+ this.ffgToken = this.curToken.type === PythonLexer.EOF
+ ? this.curToken
+ : super.nextToken();
+ }
+
+ // initialize the indentLengthStack
+ // hide the leading NEWLINE token(s)
+ // if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
+ // insert a leading INDENT token if necessary
+ private handleStartOfInput(): void {
+ // initialize the stack with a default 0 indentation length
+ this.indentLengthStack.push(0); // this will never be popped off
+ while (this.curToken!.type !== PythonLexer.EOF) {
+ if (this.curToken!.channel === Token.DEFAULT_CHANNEL) {
+ if (this.curToken!.type === PythonLexer.NEWLINE) {
+ // all the NEWLINE tokens must be ignored before the first statement
+ this.hideAndAddPendingToken(this.curToken!);
+ } else { // We're at the first statement
+ this.insertLeadingIndentToken();
+ return; // continue the processing of the current token with checkNextToken()
+ }
+ } else {
+ this.addPendingToken(this.curToken!); // it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
+ }
+ this.setCurrentAndFollowingTokens();
+ } // continue the processing of the EOF token with checkNextToken()
+ }
+
+ private insertLeadingIndentToken(): void {
+ if (this.previousPendingTokenType === PythonLexer.WS) {
+ const prevToken: Token = this.pendingTokens[this.pendingTokens.length - 1] /* .peekLast() */; // WS token
+ if (this.getIndentationLength(prevToken.text) !== 0) { // there is an "indentation" before the first statement
+ const errMsg: string = "first statement indented";
+ this.reportLexerError(errMsg);
+ // insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
+ this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.curToken!);
+ }
+ }
+ }
+
+ private handleNEWLINEtoken(): void {
+ if (this.opened > 0) { // We're in an implicit line joining, ignore the current NEWLINE token
+ this.hideAndAddPendingToken(this.curToken!);
+ } else {
+ const nlToken: Token = this.curToken?.clone()!; // save the current NEWLINE token
+ const isLookingAhead: boolean = this.ffgToken!.type === PythonLexer.WS;
+ if (isLookingAhead) {
+ this.setCurrentAndFollowingTokens(); // set the next two tokens
+ }
+
+ switch (this.ffgToken!.type) {
+ case PythonLexer.NEWLINE: // We're before a blank line
+ case PythonLexer.COMMENT: // We're before a comment
+ this.hideAndAddPendingToken(nlToken);
+ if (isLookingAhead) {
+ this.addPendingToken(this.curToken!); // WS token
+ }
+ break;
+ default:
+ this.addPendingToken(nlToken);
+ if (isLookingAhead) { // We're on whitespace(s) followed by a statement
+ const indentationLength: number = this.ffgToken!.type === PythonLexer.EOF ?
+ 0 :
+ this.getIndentationLength(this.curToken!.text);
+
+ if (indentationLength !== this.INVALID_LENGTH) {
+ this.addPendingToken(this.curToken!); // WS token
+ this.insertIndentOrDedentToken(indentationLength); // may insert INDENT token or DEDENT token(s)
+ } else {
+ this.reportError("inconsistent use of tabs and spaces in indentation");
+ }
+ } else { // We're at a newline followed by a statement (there is no whitespace before the statement)
+ this.insertIndentOrDedentToken(0); // may insert DEDENT token(s)
+ }
+ }
+ }
+ }
+
+ private insertIndentOrDedentToken(indentLength: number): void {
+ let prevIndentLength: number = this.indentLengthStack.peek()!;
+ if (indentLength > prevIndentLength) {
+ this.createAndAddPendingToken(PythonLexer.INDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!);
+ this.indentLengthStack.push(indentLength);
+ } else {
+ while (indentLength < prevIndentLength) { // more than 1 DEDENT token may be inserted to the token stream
+ this.indentLengthStack.pop();
+ prevIndentLength = this.indentLengthStack.peek()!;
+ if (indentLength <= prevIndentLength) {
+ this.createAndAddPendingToken(PythonLexer.DEDENT, Token.DEFAULT_CHANNEL, null, this.ffgToken!);
+ } else {
+ this.reportError("inconsistent dedent");
+ }
+ }
+ }
+ }
+
+ private handleFSTRING_MIDDLE_token(): void { // replace the double braces '{{' or '}}' to single braces and hide the second braces
+ let fsMid: string = this.curToken!.text;
+ fsMid = fsMid.replace(/\{\{/g, "{_").replace(/\}\}/g, "}_"); // replace: {{ --> {_ and }} --> }_
+ const arrOfStr: string[] = fsMid.split(/(?<=[{}])_/); // split by {_ or }_
+ for (let s of arrOfStr) {
+ if (s) {
+ this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, this.ffgToken!);
+ const lastCharacter: string = s.charAt(s.length - 1);
+ if ("{}".includes(lastCharacter)) {
+ this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, lastCharacter, this.ffgToken!);
+ // this inserted hidden token allows to restore the original f-string literal with the double braces
+ }
+ }
+ }
+ }
+
+ private handleFStringLexerModes(): void { // https://peps.python.org/pep-0498/#specification
+ if (this.getModeStack().length > 0) {
+ switch (this.curToken!.type) {
+ case PythonLexer.LBRACE:
+ this.pushMode(Lexer.DEFAULT_MODE);
+ this.paren_or_bracket_openedStack.push(0);
+ break;
+ case PythonLexer.LPAR:
+ case PythonLexer.LSQB:
+ // https://peps.python.org/pep-0498/#lambdas-inside-expressions
+ this.paren_or_bracket_openedStack.push(this.paren_or_bracket_openedStack.pop()! + 1); // increment the last element
+ break;
+ case PythonLexer.RPAR:
+ case PythonLexer.RSQB:
+ this.paren_or_bracket_openedStack.push(this.paren_or_bracket_openedStack.pop()! - 1); // decrement the last element
+ break;
+ case PythonLexer.COLON: // colon can only come from DEFAULT_MODE
+ if (this.paren_or_bracket_openedStack.peek() == 0) {
+ switch (this.getModeStack().at(-1) /* peek() */) { // check the previous lexer mode (the current is DEFAULT_MODE)
+ case PythonLexer.SINGLE_QUOTE_FSTRING_MODE:
+ case PythonLexer.LONG_SINGLE_QUOTE_FSTRING_MODE:
+ case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE:
+ this.setMode(PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+ case PythonLexer.DOUBLE_QUOTE_FSTRING_MODE:
+ case PythonLexer.LONG_DOUBLE_QUOTE_FSTRING_MODE:
+ case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE:
+ this.setMode(PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE); // continue in format spec. mode
+ break;
+ }
+ }
+ break;
+ case PythonLexer.RBRACE:
+ switch (this.getMode()) {
+ case Lexer.DEFAULT_MODE:
+ case PythonLexer.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE:
+ case PythonLexer.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE:
+ this.popMode();
+ this.paren_or_bracket_openedStack.pop();
+ break;
+ default:
+ this.reportLexerError("f-string: single '}' is not allowed");
+ break;
+ }
+ break;
+ }
+ }
+ }
+
+ private handleFORMAT_SPECIFICATION_MODE(): void {
+ if (this.getModeStack().length > 0 && this.ffgToken!.type === PythonLexer.RBRACE) {
+ switch (this.curToken!.type) {
+ case PythonLexer.COLON:
+ case PythonLexer.RBRACE:
+ // insert an empty FSTRING_MIDDLE token instead of the missing format specification
+ this.createAndAddPendingToken(PythonLexer.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", this.ffgToken!);
+ break;
+ }
+ }
+ }
+
+ private insertTrailingTokens(): void {
+ switch (this.lastPendingTokenTypeFromDefaultChannel) {
+ case PythonLexer.NEWLINE:
+ case PythonLexer.DEDENT:
+ break; // no trailing NEWLINE token is needed
+ default:
+ // insert an extra trailing NEWLINE token that serves as the end of the last statement
+ this.createAndAddPendingToken(PythonLexer.NEWLINE, Token.DEFAULT_CHANNEL, null, this.ffgToken!); // ffgToken is EOF
+ }
+ this.insertIndentOrDedentToken(0); // Now insert as much trailing DEDENT tokens as needed
+ }
+
+ private handleEOFtoken(): void {
+ if (this.lastPendingTokenTypeFromDefaultChannel > 0) {
+ // there was a statement in the input (leading NEWLINE tokens are hidden)
+ this.insertTrailingTokens();
+ }
+ this.addPendingToken(this.curToken!);
+ }
+
+ private hideAndAddPendingToken(tkn: Token): void {
+ tkn.channel = Token.HIDDEN_CHANNEL;
+ this.addPendingToken(tkn);
+ }
+
+ private createAndAddPendingToken(type: number, channel: number, text: string | null, sampleToken: Token): void {
+ const tkn: Token = sampleToken.clone();
+ tkn.type = type;
+ tkn.channel = channel;
+ tkn.stop = sampleToken.start - 1;
+ tkn.text = text == null ?
+ `<${this.getSymbolicNames()[type]}>` :
+ text;
+
+ this.addPendingToken(tkn);
+ }
+
+ private addPendingToken(tkn: Token): void {
+ // save the last pending token type because the pendingTokens linked list can be empty by the nextToken()
+ this.previousPendingTokenType = tkn.type;
+ if (tkn.channel === Token.DEFAULT_CHANNEL) {
+ this.lastPendingTokenTypeFromDefaultChannel = this.previousPendingTokenType;
+ }
+ this.pendingTokens.push(tkn) /* .addLast(token) */;
+ }
+
+ private getIndentationLength(indentText: string): number { // the indentText may contain spaces, tabs or form feeds
+ const TAB_LENGTH: number = 8; // the standard number of spaces to replace a tab to spaces
+ let length: number = 0;
+ for (let ch of indentText) {
+ switch (ch) {
+ case " ":
+ this.wasSpaceIndentation = true;
+ length += 1;
+ break;
+ case "\t":
+ this.wasTabIndentation = true;
+ length += TAB_LENGTH - (length % TAB_LENGTH);
+ break;
+ case "\f": // form feed
+ length = 0;
+ break;
+ }
+ }
+
+ if (this.wasTabIndentation && this.wasSpaceIndentation) {
+ if (!this.wasIndentationMixedWithSpacesAndTabs) {
+ this.wasIndentationMixedWithSpacesAndTabs = true;
+ length = this.INVALID_LENGTH; // only for the first inconsistent indent
+ }
+ }
+ return length;
+ }
+
+ private reportLexerError(errMsg: string): void {
+ this.getErrorListener().syntaxError(this, 0 /* this.curToken */, this.curToken!.line, this.curToken!.column, " LEXER" + this.ERR_TXT + errMsg, undefined);
+ }
+
+ private reportError(errMsg: string): void {
+ this.reportLexerError(errMsg);
+
+ // the ERRORTOKEN will raise an error in the parser
+ this.createAndAddPendingToken(PythonLexer.ERRORTOKEN, Token.DEFAULT_CHANNEL, this.ERR_TXT + errMsg, this.ffgToken!);
+ }
+}
diff --git a/python/python3_12/TypeScript/PythonParserBase.ts b/python/python3_12/TypeScript/PythonParserBase.ts
new file mode 100644
index 0000000000..a21b3117af
--- /dev/null
+++ b/python/python3_12/TypeScript/PythonParserBase.ts
@@ -0,0 +1,16 @@
+import { Parser, TokenStream } from "antlr4";
+//import antlr4 from "antlr4";
+
+export default class PythonParserBase extends Parser {
+ constructor(input: TokenStream) {
+ super(input);
+ }
+
+ isEqualToCurrentTokenText(tokenText: string): boolean {
+ return this.getCurrentToken().text === tokenText;
+ }
+
+ isnotEqualToCurrentTokenText(tokenText: string): boolean {
+ return !this.isEqualToCurrentTokenText(tokenText); // for compatibility with the Python 'not' logical operator
+ }
+}
diff --git a/python/python3_12/changes.txt b/python/python3_12/changes.txt
new file mode 100644
index 0000000000..b9f4d706f5
--- /dev/null
+++ b/python/python3_12/changes.txt
@@ -0,0 +1,7 @@
+Sept 05, 2024
+--------------
+Type comment tokens are no longer generated.
+Type comments will now be tokenized as plain comment tokens.
+
+Line continuation for string literals (backslash followed by a newline) is no longer resolved.
+(backslash+newline is no longer removed from string literals)
diff --git a/python/python3_12_1/desc.xml b/python/python3_12/desc.xml
similarity index 59%
rename from python/python3_12_1/desc.xml
rename to python/python3_12/desc.xml
index f6ddd173f0..8aa6fdea92 100644
--- a/python/python3_12_1/desc.xml
+++ b/python/python3_12/desc.xml
@@ -1,9 +1,9 @@
- ^4.13.1
- CSharp;Java;Python3;JavaScript
+ ^4.13.2
+ CSharp;Java;Python3;JavaScript;TypeScript
- CSharp;Java;Python3;JavaScript
+ CSharp;Java;Python3;JavaScript;TypeScript
file_input
examples
diff --git a/python/python3_12_1/examples/__future__.py b/python/python3_12/examples/__future__.py
similarity index 100%
rename from python/python3_12_1/examples/__future__.py
rename to python/python3_12/examples/__future__.py
diff --git a/python/python3_12_1/examples/__hello__.py b/python/python3_12/examples/__hello__.py
similarity index 100%
rename from python/python3_12_1/examples/__hello__.py
rename to python/python3_12/examples/__hello__.py
diff --git a/python/python3_12_1/examples/_aix_support.py b/python/python3_12/examples/_aix_support.py
similarity index 100%
rename from python/python3_12_1/examples/_aix_support.py
rename to python/python3_12/examples/_aix_support.py
diff --git a/python/python3_12_1/examples/_collections_abc.py b/python/python3_12/examples/_collections_abc.py
similarity index 100%
rename from python/python3_12_1/examples/_collections_abc.py
rename to python/python3_12/examples/_collections_abc.py
diff --git a/python/python3_12_1/examples/_compat_pickle.py b/python/python3_12/examples/_compat_pickle.py
similarity index 100%
rename from python/python3_12_1/examples/_compat_pickle.py
rename to python/python3_12/examples/_compat_pickle.py
diff --git a/python/python3_12_1/examples/_compression.py b/python/python3_12/examples/_compression.py
similarity index 100%
rename from python/python3_12_1/examples/_compression.py
rename to python/python3_12/examples/_compression.py
diff --git a/python/python3_12_1/examples/_markupbase.py b/python/python3_12/examples/_markupbase.py
similarity index 100%
rename from python/python3_12_1/examples/_markupbase.py
rename to python/python3_12/examples/_markupbase.py
diff --git a/python/python3_12_1/examples/_osx_support.py b/python/python3_12/examples/_osx_support.py
similarity index 100%
rename from python/python3_12_1/examples/_osx_support.py
rename to python/python3_12/examples/_osx_support.py
diff --git a/python/python3_12_1/examples/_py_abc.py b/python/python3_12/examples/_py_abc.py
similarity index 100%
rename from python/python3_12_1/examples/_py_abc.py
rename to python/python3_12/examples/_py_abc.py
diff --git a/python/python3_12_1/examples/_pydatetime.py b/python/python3_12/examples/_pydatetime.py
similarity index 100%
rename from python/python3_12_1/examples/_pydatetime.py
rename to python/python3_12/examples/_pydatetime.py
diff --git a/python/python3_12_1/examples/_pydecimal.py b/python/python3_12/examples/_pydecimal.py
similarity index 100%
rename from python/python3_12_1/examples/_pydecimal.py
rename to python/python3_12/examples/_pydecimal.py
diff --git a/python/python3_12_1/examples/_pyio.py b/python/python3_12/examples/_pyio.py
similarity index 100%
rename from python/python3_12_1/examples/_pyio.py
rename to python/python3_12/examples/_pyio.py
diff --git a/python/python3_12_1/examples/_pylong.py b/python/python3_12/examples/_pylong.py
similarity index 100%
rename from python/python3_12_1/examples/_pylong.py
rename to python/python3_12/examples/_pylong.py
diff --git a/python/python3_12_1/examples/_sitebuiltins.py b/python/python3_12/examples/_sitebuiltins.py
similarity index 100%
rename from python/python3_12_1/examples/_sitebuiltins.py
rename to python/python3_12/examples/_sitebuiltins.py
diff --git a/python/python3_12_1/examples/_strptime.py b/python/python3_12/examples/_strptime.py
similarity index 100%
rename from python/python3_12_1/examples/_strptime.py
rename to python/python3_12/examples/_strptime.py
diff --git a/python/python3_12_1/examples/_threading_local.py b/python/python3_12/examples/_threading_local.py
similarity index 100%
rename from python/python3_12_1/examples/_threading_local.py
rename to python/python3_12/examples/_threading_local.py
diff --git a/python/python3_12_1/examples/_weakrefset.py b/python/python3_12/examples/_weakrefset.py
similarity index 100%
rename from python/python3_12_1/examples/_weakrefset.py
rename to python/python3_12/examples/_weakrefset.py
diff --git a/python/python3_12_1/examples/abc.py b/python/python3_12/examples/abc.py
similarity index 100%
rename from python/python3_12_1/examples/abc.py
rename to python/python3_12/examples/abc.py
diff --git a/python/python3_12_1/examples/aifc.py b/python/python3_12/examples/aifc.py
similarity index 100%
rename from python/python3_12_1/examples/aifc.py
rename to python/python3_12/examples/aifc.py
diff --git a/python/python3_12_1/examples/antigravity.py b/python/python3_12/examples/antigravity.py
similarity index 100%
rename from python/python3_12_1/examples/antigravity.py
rename to python/python3_12/examples/antigravity.py
diff --git a/python/python3_12_1/examples/argparse.py b/python/python3_12/examples/argparse.py
similarity index 100%
rename from python/python3_12_1/examples/argparse.py
rename to python/python3_12/examples/argparse.py
diff --git a/python/python3_12_1/examples/ast.py b/python/python3_12/examples/ast.py
similarity index 100%
rename from python/python3_12_1/examples/ast.py
rename to python/python3_12/examples/ast.py
diff --git a/python/python3_12_1/examples/base64.py b/python/python3_12/examples/base64.py
similarity index 100%
rename from python/python3_12_1/examples/base64.py
rename to python/python3_12/examples/base64.py
diff --git a/python/python3_12_1/examples/bdb.py b/python/python3_12/examples/bdb.py
similarity index 100%
rename from python/python3_12_1/examples/bdb.py
rename to python/python3_12/examples/bdb.py
diff --git a/python/python3_12_1/examples/bisect.py b/python/python3_12/examples/bisect.py
similarity index 100%
rename from python/python3_12_1/examples/bisect.py
rename to python/python3_12/examples/bisect.py
diff --git a/python/python3_12_1/examples/bz2.py b/python/python3_12/examples/bz2.py
similarity index 100%
rename from python/python3_12_1/examples/bz2.py
rename to python/python3_12/examples/bz2.py
diff --git a/python/python3_12_1/examples/calendar.py b/python/python3_12/examples/calendar.py
similarity index 100%
rename from python/python3_12_1/examples/calendar.py
rename to python/python3_12/examples/calendar.py
diff --git a/python/python3_12_1/examples/cgi.py b/python/python3_12/examples/cgi.py
similarity index 100%
rename from python/python3_12_1/examples/cgi.py
rename to python/python3_12/examples/cgi.py
diff --git a/python/python3_12_1/examples/cgitb.py b/python/python3_12/examples/cgitb.py
similarity index 100%
rename from python/python3_12_1/examples/cgitb.py
rename to python/python3_12/examples/cgitb.py
diff --git a/python/python3_12_1/examples/chunk.py b/python/python3_12/examples/chunk.py
similarity index 100%
rename from python/python3_12_1/examples/chunk.py
rename to python/python3_12/examples/chunk.py
diff --git a/python/python3_12_1/pom.xml b/python/python3_12/pom.xml
similarity index 100%
rename from python/python3_12_1/pom.xml
rename to python/python3_12/pom.xml
diff --git a/python/python3_12_1/tests/test_empty_file.py b/python/python3_12/tests/test_empty_file.py
similarity index 100%
rename from python/python3_12_1/tests/test_empty_file.py
rename to python/python3_12/tests/test_empty_file.py
diff --git a/python/python3_12_1/tests/test_error_first_statement_indented.py b/python/python3_12/tests/test_error_first_statement_indented.py
similarity index 76%
rename from python/python3_12_1/tests/test_error_first_statement_indented.py
rename to python/python3_12/tests/test_error_first_statement_indented.py
index dc70cc8572..39431ac786 100644
--- a/python/python3_12_1/tests/test_error_first_statement_indented.py
+++ b/python/python3_12/tests/test_error_first_statement_indented.py
@@ -4,7 +4,7 @@
# EXPECTATIONS:
# - inserted leading INDENT token
# - hidden NEWLINE tokens (channel=1) before the first statement
-# - lexer error message: "line 10:3 first statement indented"
+# - lexer error message: "line 10:3 LEXER ERROR: first statement indented"
i = 1 # first statement begins with space
diff --git a/python/python3_12_1/tests/test_error_inconsistent_dedent.py b/python/python3_12/tests/test_error_inconsistent_dedent.py
similarity index 73%
rename from python/python3_12_1/tests/test_error_inconsistent_dedent.py
rename to python/python3_12/tests/test_error_inconsistent_dedent.py
index 0a74fde76a..660f59ff65 100644
--- a/python/python3_12_1/tests/test_error_inconsistent_dedent.py
+++ b/python/python3_12/tests/test_error_inconsistent_dedent.py
@@ -3,7 +3,7 @@
#
# EXPECTATIONS:
# - inserted ERROR_TOKEN instead of the DEDENT token
-# - lexer error message: "line 10:0 inconsistent dedent"
+# - lexer error message: "line 10:0 LEXER ERROR: inconsistent dedent"
if True:
i = 0
diff --git a/python/python3_12_1/tests/test_error_not_indented.py b/python/python3_12/tests/test_error_not_indented.py
similarity index 100%
rename from python/python3_12_1/tests/test_error_not_indented.py
rename to python/python3_12/tests/test_error_not_indented.py
diff --git a/python/python3_12_1/tests/test_error_tab_and_space_in_indentation.py b/python/python3_12/tests/test_error_tab_and_space_in_indentation.py
similarity index 69%
rename from python/python3_12_1/tests/test_error_tab_and_space_in_indentation.py
rename to python/python3_12/tests/test_error_tab_and_space_in_indentation.py
index 493933be68..7d77a9bc0e 100644
--- a/python/python3_12_1/tests/test_error_tab_and_space_in_indentation.py
+++ b/python/python3_12/tests/test_error_tab_and_space_in_indentation.py
@@ -3,7 +3,7 @@
#
# EXPECTATIONS:
# - inserted ERROR_TOKEN instead of the WS token
-# - lexer error message: "line 11:0 inconsistent use of tabs and spaces in indentation"
+# - lexer error message: "line 11:0 LEXER ERROR: inconsistent use of tabs and spaces in indentation"
if True:
i = 0 # indented by spaces
diff --git a/python/python3_12_1/tests/test_error_unexpected_indent.py b/python/python3_12/tests/test_error_unexpected_indent.py
similarity index 71%
rename from python/python3_12_1/tests/test_error_unexpected_indent.py
rename to python/python3_12/tests/test_error_unexpected_indent.py
index 9d6bbd3f1f..9fca02bf5d 100644
--- a/python/python3_12_1/tests/test_error_unexpected_indent.py
+++ b/python/python3_12/tests/test_error_unexpected_indent.py
@@ -2,7 +2,7 @@
# grun Python file_input -tokens test_error_unexpected_indent.py
#
# EXPECTATION:
-# - parser error message: "line 9:7 extraneous input '' ..."
+# - parser error message: "line 9:7 mismatched input '' ..."
if True:
i = 0
diff --git a/python/python3_12_1/tests/test_explicit_line_joining.py b/python/python3_12/tests/test_explicit_line_joining.py
similarity index 72%
rename from python/python3_12_1/tests/test_explicit_line_joining.py
rename to python/python3_12/tests/test_explicit_line_joining.py
index 011ee61e4b..55be1bd964 100644
--- a/python/python3_12_1/tests/test_explicit_line_joining.py
+++ b/python/python3_12/tests/test_explicit_line_joining.py
@@ -2,7 +2,7 @@
# grun Python file_input -tokens test_explicit_line_joining.py
#
# EXPECTATIONS:
-# - hiden (channel=1) LINE_JOINING token
+# - hiden (channel=1) EXPLICIT_LINE_JOINING token
# - no error message
i = 1 \
diff --git a/python/python3_12_1/tests/test_formfeed_as_separator.py b/python/python3_12/tests/test_formfeed_as_separator.py
similarity index 100%
rename from python/python3_12_1/tests/test_formfeed_as_separator.py
rename to python/python3_12/tests/test_formfeed_as_separator.py
diff --git a/python/python3_12_1/tests/test_formfeed_at_start_of_line.py b/python/python3_12/tests/test_formfeed_at_start_of_line.py
similarity index 100%
rename from python/python3_12_1/tests/test_formfeed_at_start_of_line.py
rename to python/python3_12/tests/test_formfeed_at_start_of_line.py
diff --git a/python/python3_12_1/tests/test_formfeed_in_indent.py b/python/python3_12/tests/test_formfeed_in_indent.py
similarity index 100%
rename from python/python3_12_1/tests/test_formfeed_in_indent.py
rename to python/python3_12/tests/test_formfeed_in_indent.py
diff --git a/python/python3_12_1/tests/test_hidden_NEWLINE_before_blank_line.py b/python/python3_12/tests/test_hidden_NEWLINE_before_blank_line.py
similarity index 100%
rename from python/python3_12_1/tests/test_hidden_NEWLINE_before_blank_line.py
rename to python/python3_12/tests/test_hidden_NEWLINE_before_blank_line.py
diff --git a/python/python3_12_1/tests/test_hidden_NEWLINE_before_comment.py b/python/python3_12/tests/test_hidden_NEWLINE_before_comment.py
similarity index 76%
rename from python/python3_12_1/tests/test_hidden_NEWLINE_before_comment.py
rename to python/python3_12/tests/test_hidden_NEWLINE_before_comment.py
index d080bc16fb..9db3798954 100644
--- a/python/python3_12_1/tests/test_hidden_NEWLINE_before_comment.py
+++ b/python/python3_12/tests/test_hidden_NEWLINE_before_comment.py
@@ -6,6 +6,6 @@ def inc(value):
# grun Python file_input -tokens test_hidden_NEWLINE_before_comment.py
#
# EXPECTATIONS:
-# - hidden NEWLINE tokens (channel=1) before a COMMENT (or a TYPE_COMMENT) token
+# - hidden NEWLINE tokens (channel=1) before a COMMENT token
# - hidden NEWLINE token (channel=1) before the blank line
# - no error message
diff --git a/python/python3_12_1/tests/test_hidden_leading_NEWLINEs.py b/python/python3_12/tests/test_hidden_leading_NEWLINEs.py
similarity index 100%
rename from python/python3_12_1/tests/test_hidden_leading_NEWLINEs.py
rename to python/python3_12/tests/test_hidden_leading_NEWLINEs.py
diff --git a/python/python3_12_1/tests/test_implicit_line_joining.py b/python/python3_12/tests/test_implicit_line_joining.py
similarity index 100%
rename from python/python3_12_1/tests/test_implicit_line_joining.py
rename to python/python3_12/tests/test_implicit_line_joining.py
diff --git a/python/python3_12_1/tests/test_insert_trailing_NEWLINE_1.py b/python/python3_12/tests/test_insert_trailing_NEWLINE_1.py
similarity index 100%
rename from python/python3_12_1/tests/test_insert_trailing_NEWLINE_1.py
rename to python/python3_12/tests/test_insert_trailing_NEWLINE_1.py
diff --git a/python/python3_12_1/tests/test_insert_trailing_NEWLINE_2.py b/python/python3_12/tests/test_insert_trailing_NEWLINE_2.py
similarity index 100%
rename from python/python3_12_1/tests/test_insert_trailing_NEWLINE_2.py
rename to python/python3_12/tests/test_insert_trailing_NEWLINE_2.py
diff --git a/python/python3_12_1/tests/test_no_trailing_NEWLINE.py b/python/python3_12/tests/test_no_trailing_NEWLINE.py
similarity index 100%
rename from python/python3_12_1/tests/test_no_trailing_NEWLINE.py
rename to python/python3_12/tests/test_no_trailing_NEWLINE.py
diff --git a/python/python3_12_1/tests/test_trailing_inconsistent_dedent.py b/python/python3_12/tests/test_trailing_inconsistent_dedent.py
similarity index 100%
rename from python/python3_12_1/tests/test_trailing_inconsistent_dedent.py
rename to python/python3_12/tests/test_trailing_inconsistent_dedent.py
diff --git a/python/python3_12_1/tests/test_trailing_indent.py b/python/python3_12/tests/test_trailing_indent.py
similarity index 100%
rename from python/python3_12_1/tests/test_trailing_indent.py
rename to python/python3_12/tests/test_trailing_indent.py
diff --git a/python/python3_12_1/tests/test_trailing_unexpected_indent.py b/python/python3_12/tests/test_trailing_unexpected_indent.py
similarity index 100%
rename from python/python3_12_1/tests/test_trailing_unexpected_indent.py
rename to python/python3_12/tests/test_trailing_unexpected_indent.py
diff --git a/python/python3_12_1/Python3/PythonLexerBase.py b/python/python3_12_1/Python3/PythonLexerBase.py
deleted file mode 100644
index 1926ab1ffc..0000000000
--- a/python/python3_12_1/Python3/PythonLexerBase.py
+++ /dev/null
@@ -1,325 +0,0 @@
-# The MIT License (MIT)
-# Copyright (c) 2021 Robert Einhorn
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-# Project : Python Indent/Dedent handler for ANTLR4 grammars
-#
-# Developed by : Robert Einhorn
-
-from collections import deque
-from typing import TextIO
-from antlr4 import InputStream, Lexer, Token
-from antlr4.Token import CommonToken
-import sys
-import re
-
-class PythonLexerBase(Lexer):
- def __init__(self, input: InputStream, output: TextIO = sys.stdout):
- super().__init__(input, output)
-
- # A stack that keeps track of the indentation lengths
- self.indent_length_stack: Deque[int]
-
- # A list where tokens are waiting to be loaded into the token stream
- self.pending_tokens: list[CommonToken]
-
- # last pending token types
- self.previous_pending_token_type: int
- self.last_pending_token_type_from_default_channel: int
-
- # The amount of opened parentheses, square brackets or curly braces
- self.opened: int
- # The amount of opened parentheses and square brackets in the current lexer mode
- self.paren_or_bracket_opened_stack: Deque[int]
-
- self.was_space_indentation: bool
- self.was_tab_indentation: bool
- self.was_indentation_mixed_with_spaces_and_tabs: bool
- self.INVALID_LENGTH: int
-
- self.cur_token: CommonToken # current (under processing) token
- self.ffg_token: CommonToken # following (look ahead) token
-
- self.ERR_TXT: str
-
- self.init()
-
- def init(self):
- self.indent_length_stack = deque()
- self.pending_tokens = []
- self.previous_pending_token_type = 0
- self.last_pending_token_type_from_default_channel = 0
- self.opened = 0
- self.paren_or_bracket_opened_stack = deque()
- self.was_space_indentation = False
- self.was_tab_indentation = False
- self.was_indentation_mixed_with_spaces_and_tabs = False
- self.INVALID_LENGTH = -1
- self.cur_token = None
- self.ffg_token = None
- self.ERR_TXT = " ERROR: "
-
- def nextToken(self) -> CommonToken: # reading the input stream until a return EOF
- self.check_next_token()
- return self.pending_tokens.pop(0) # add the queued token to the token stream
-
- def check_next_token(self):
- if self.previous_pending_token_type != Token.EOF:
- self.set_current_and_following_tokens()
- if len(self.indent_length_stack) == 0: # We're at the first token
- self.handle_start_of_input()
- match self.cur_token.type:
- case self.LPAR | self.LSQB | self.LBRACE:
- self.opened += 1
- self.add_pending_token(self.cur_token)
- case self.RPAR | self.RSQB | self.RBRACE:
- self.opened -= 1
- self.add_pending_token(self.cur_token)
- case self.NEWLINE:
- self.handle_NEWLINE_token()
- case self.STRING:
- self.handle_STRING_token()
- case self.FSTRING_MIDDLE:
- self.handle_FSTRING_MIDDLE_token()
- case self.ERROR_TOKEN:
- self.report_lexer_error("token recognition error at: '" + self.cur_token.text + "'")
- self.add_pending_token(self.cur_token)
- case Token.EOF:
- self.handle_EOF_token()
- case other:
- self.add_pending_token(self.cur_token)
- self.handle_FORMAT_SPECIFICATION_MODE()
-
- def set_current_and_following_tokens(self):
- self.cur_token = super().nextToken() if self.ffg_token is None else \
- self.ffg_token
-
- self.handle_fstring_lexer_modes()
-
- self.ffg_token = self.cur_token if self.cur_token.type == Token.EOF else \
- super().nextToken()
-
- # initialize the _indent_length_stack
- # hide the leading NEWLINE token(s)
- # if exists, find the first statement (not NEWLINE, not EOF token) that comes from the default channel
- # insert a leading INDENT token if necessary
- def handle_start_of_input(self):
- # initialize the stack with a default 0 indentation length
- self.indent_length_stack.append(0) # this will never be popped off
- while self.cur_token.type != Token.EOF:
- if self.cur_token.channel == Token.DEFAULT_CHANNEL:
- if self.cur_token.type == self.NEWLINE:
- # all the NEWLINE tokens must be ignored before the first statement
- self.hide_and_add_pending_token(self.cur_token)
- else: # We're at the first statement
- self.insert_leading_indent_token()
- return # continue the processing of the current token with check_next_token()
- else:
- self.add_pending_token(self.cur_token) # it can be WS, EXPLICIT_LINE_JOINING or COMMENT token
- self.set_current_and_following_tokens()
- # continue the processing of the EOF token with check_next_token()
-
- def insert_leading_indent_token(self):
- if self.previous_pending_token_type == self.WS:
- prev_token: CommonToken = self.pending_tokens[-1] # WS token
- if self.get_indentation_length(prev_token.text) != 0: # there is an "indentation" before the first statement
- err_msg: str = "first statement indented"
- self.report_lexer_error(err_msg)
- # insert an INDENT token before the first statement to raise an 'unexpected indent' error later by the parser
- self.create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, self.ERR_TXT + err_msg, self.cur_token)
-
- def handle_NEWLINE_token(self):
- if self.opened > 0: # We're in an implicit line joining, ignore the current NEWLINE token
- self.hide_and_add_pending_token(self.cur_token)
- else:
- nl_token: CommonToken = self.cur_token # save the current NEWLINE token
- is_looking_ahead: bool = self.ffg_token.type == self.WS
- if is_looking_ahead:
- self.set_current_and_following_tokens() # set the next two tokens
-
- match self.ffg_token.type:
- case self.NEWLINE | self.COMMENT | self.TYPE_COMMENT:
- # We're before a blank line or a comment or a type comment
- self.hide_and_add_pending_token(nl_token) # ignore the NEWLINE token
- if is_looking_ahead:
- self.add_pending_token(self.cur_token) # WS token
- case other:
- self.add_pending_token(nl_token)
- if is_looking_ahead: # We're on a whitespace(s) followed by a statement
- indentation_length: int = 0 if self.ffg_token.type == Token.EOF else \
- self.get_indentation_length(self.cur_token.text)
-
- if indentation_length != self.INVALID_LENGTH:
- self.add_pending_token(self.cur_token) # WS token
- self.insert_indent_or_dedent_token(indentation_length) # may insert INDENT token or DEDENT token(s)
- else:
- self.report_error("inconsistent use of tabs and spaces in indentation")
- else: # We're at a newline followed by a statement (there is no whitespace before the statement)
- self.insert_indent_or_dedent_token(0) # may insert DEDENT token(s)
-
- def insert_indent_or_dedent_token(self, indent_length: int):
- prev_indent_length: int = self.indent_length_stack[-1] # peek()
- if indent_length > prev_indent_length:
- self.create_and_add_pending_token(self.INDENT, Token.DEFAULT_CHANNEL, None, self.ffg_token)
- self.indent_length_stack.append(indent_length)
- else:
- while indent_length < prev_indent_length: # more than 1 DEDENT token may be inserted to the token stream
- self.indent_length_stack.pop()
- prev_indent_length = self.indent_length_stack[-1] # peek()
- if indent_length <= prev_indent_length:
- self.create_and_add_pending_token(self.DEDENT, Token.DEFAULT_CHANNEL, None, self.ffg_token)
- else:
- self.report_error("inconsistent dedent")
-
- def handle_STRING_token(self): # remove the \ escape sequences from the string literal
- # https://docs.python.org/3.11/reference/lexical_analysis.html#string-and-bytes-literals
- line_joinFreeStringLiteral: str = re.sub(r"\\\r?\n", "", self.cur_token.text)
- if len(self.cur_token.text) == len(line_joinFreeStringLiteral):
- self.add_pending_token(self.cur_token)
- else:
- originalSTRINGtoken: CommonToken = self.cur_token.clone() # backup the original token
- self.cur_token.text = line_joinFreeStringLiteral
- self.add_pending_token(self.cur_token) # add the modified token with inline string literal
- self.hide_and_add_pending_token(originalSTRINGtoken) # add the original token to the hidden channel
- # this inserted hidden token allows to restore the original string literal with the \ escape sequences
-
- def handle_FSTRING_MIDDLE_token(self): # replace the double braces '{{' or '}}' to single braces and hide the second braces
- fs_mid: str = self.cur_token.text
- fs_mid = fs_mid.replace("{{", "{_").replace("}}", "}_") # replace: {{ --> {_ and }} --> }_
- arrOfStr: list[str] = re.split(r"(?<=[{}])_", fs_mid) # split by {_ or }_
- s: str
- for s in arrOfStr:
- if s:
- self.create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, s, self.ffg_token)
- lastCharacter: str = s[-1:]
- if lastCharacter in "{}":
- self.create_and_add_pending_token(self.FSTRING_MIDDLE, Token.HIDDEN_CHANNEL, lastCharacter, self.ffg_token)
-
- def handle_fstring_lexer_modes(self):
- if self._modeStack:
- match self.cur_token.type:
- case self.LBRACE:
- self.pushMode(Lexer.DEFAULT_MODE)
- self.paren_or_bracket_opened_stack.append(0)
- case self.LPAR | self.LSQB:
- # https://peps.python.org/pep-0498/#lambdas-inside-expressions
- self.paren_or_bracket_opened_stack[-1] += 1 # increment the last element (peek() + 1)
- case self.RPAR | self.RSQB:
- self.paren_or_bracket_opened_stack[-1] -= 1 # decrement the last element (peek() - 1)
- case self.COLON:
- if self.paren_or_bracket_opened_stack[-1] == 0:
- match self._modeStack[-1]: # check the previous lexer mode (the current is DEFAULT_MODE)
- case self.SINGLE_QUOTE_FSTRING_MODE \
- | self.LONG_SINGLE_QUOTE_FSTRING_MODE \
- | self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE:
-
- self.mode(self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
- case self.DOUBLE_QUOTE_FSTRING_MODE \
- | self.LONG_DOUBLE_QUOTE_FSTRING_MODE \
- | self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE:
-
- self.mode(self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE) # continue in format spec. mode
- case self.RBRACE:
- match self._mode:
- case Lexer.DEFAULT_MODE \
- | self.SINGLE_QUOTE_FORMAT_SPECIFICATION_MODE \
- | self.DOUBLE_QUOTE_FORMAT_SPECIFICATION_MODE:
-
- self.popMode()
- self.paren_or_bracket_opened_stack.pop()
- case other:
- self.report_lexer_error("f-string: single '}' is not allowed")
-
- def handle_FORMAT_SPECIFICATION_MODE(self):
- if len(self._modeStack) != 0 \
- and self.ffg_token.type == self.RBRACE:
-
- match self.cur_token.type:
- case self.COLON | self.RBRACE:
- # insert an empty FSTRING_MIDDLE token instead of the missing format specification
- self.create_and_add_pending_token(self.FSTRING_MIDDLE, Token.DEFAULT_CHANNEL, "", self.ffg_token)
-
- def insert_trailing_tokens(self):
- match self.last_pending_token_type_from_default_channel:
- case self.NEWLINE | self.DEDENT:
- pass # no trailing NEWLINE token is needed
- case other:
- # insert an extra trailing NEWLINE token that serves as the end of the last statement
- self.create_and_add_pending_token(self.NEWLINE, Token.DEFAULT_CHANNEL, None, self.ffg_token) # _ffg_token is EOF
- self.insert_indent_or_dedent_token(0) # Now insert as much trailing DEDENT tokens as needed
-
- def handle_EOF_token(self):
- if self.last_pending_token_type_from_default_channel > 0:
- # there was statement in the input (leading NEWLINE tokens are hidden)
- self.insert_trailing_tokens()
- self.add_pending_token(self.cur_token)
-
- def hide_and_add_pending_token(self, cToken: CommonToken):
- cToken.channel = Token.HIDDEN_CHANNEL
- self.add_pending_token(cToken)
-
- def create_and_add_pending_token(self, type: int, channel: int, text: str, base_token: CommonToken):
- cToken: CommonToken = base_token.clone()
- cToken.type = type
- cToken.channel = channel
- cToken.stop = base_token.start - 1
- cToken.text = "<" + self.symbolicNames[type] + ">" if text is None else \
- text
-
- self.add_pending_token(cToken)
-
- def add_pending_token(self, token: CommonToken):
- # save the last pending token type because the _pending_tokens list can be empty by the nextToken()
- self.previous_pending_token_type = token.type
- if token.channel == Token.DEFAULT_CHANNEL:
- self.last_pending_token_type_from_default_channel = self.previous_pending_token_type
- self.pending_tokens.append(token)
-
- def get_indentation_length(self, textWS: str) -> int: # the textWS may contain spaces, tabs or form feeds
- TAB_LENGTH: int = 8 # the standard number of spaces to replace a tab to spaces
- length: int = 0
- ch: str
- for ch in textWS:
- match ch:
- case ' ':
- self.was_space_indentation = True
- length += 1
- case '\t':
- self.was_tab_indentation = True
- length += TAB_LENGTH - (length % TAB_LENGTH)
- case '\f': # form feed
- length = 0
-
- if self.was_tab_indentation and self.was_space_indentation:
- if not self.was_indentation_mixed_with_spaces_and_tabs:
- self.was_indentation_mixed_with_spaces_and_tabs = True
- return self.INVALID_LENGTH # only for the first inconsistent indent
- return length
-
- def report_lexer_error(self, err_msg):
- self.getErrorListenerDispatch().syntaxError(self, self.cur_token, self.cur_token.line, self.cur_token.column, " LEXER" + self.ERR_TXT + err_msg, None)
-
- def report_error(self, err_msg):
- self.report_lexer_error(err_msg)
-
- # the ERROR_TOKEN will raise an error in the parser
- self.create_and_add_pending_token(self.ERROR_TOKEN, Token.DEFAULT_CHANNEL, self.ERR_TXT + err_msg, self.ffg_token)
-
- def reset(self):
- self.init()
- super().reset()
diff --git a/python/python3_12_1/tests/test_double_braces_in_fstring_literal.py b/python/python3_12_1/tests/test_double_braces_in_fstring_literal.py
deleted file mode 100644
index ca07eed13d..0000000000
--- a/python/python3_12_1/tests/test_double_braces_in_fstring_literal.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# COMMAND LINE:
-# grun Python file_input -tokens test_double_braces_in_fstring_literal.py
-#
-# EXPECTATIONS:
-# - replace the double braces '{{' or '}}' to single braces: '{' or '}'
-# - inserted hidden second brace token (channel=1)
-# - no error message
-
-print(f"{{ {4*10} }}")
diff --git a/python/python3_12_1/tests/test_lambda_colon_in_fstring_literal.py b/python/python3_12_1/tests/test_lambda_colon_in_fstring_literal.py
deleted file mode 100644
index dd5f492a40..0000000000
--- a/python/python3_12_1/tests/test_lambda_colon_in_fstring_literal.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# COMMAND LINE:
-# grun Python file_input -tokens test_lambda_colon_in_fstring_literal.py
-#
-# EXPECTATIONS:
-# - the colon of the lambda expression is not a start of format specifier in the fstring literal
-# - no error message
-
-print(f"{(lambda x: x*2)(3)}")
diff --git a/python/python3_12_1/tests/test_missing format specification_in_fstring_literal.py b/python/python3_12_1/tests/test_missing format specification_in_fstring_literal.py
deleted file mode 100644
index 625b3be7e1..0000000000
--- a/python/python3_12_1/tests/test_missing format specification_in_fstring_literal.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# COMMAND LINE:
-# grun Python file_input -tokens test_missing_format_specification_in_fstring_literal.py
-#
-# EXPECTATIONS:
-# - inserted empty FSTRING_MIDDLE token instead of the missing format specification (after the colon)
-# - no error message
-
-print(f"{.070:}")
-
diff --git a/python/python3_12_1/tests/test_string_literal_with_newline_escape_sequence.py b/python/python3_12_1/tests/test_string_literal_with_newline_escape_sequence.py
deleted file mode 100644
index f14d73cb74..0000000000
--- a/python/python3_12_1/tests/test_string_literal_with_newline_escape_sequence.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# COMMAND LINE:
-# grun Python file_input -tokens test_string_literal_with_newline_escape_sequence.py
-#
-# EXPECTATIONS:
-# - removed \ escape sequence from the STRING token
-# - inserted hidden token (channel=1) with the original string literal
-# - no error message
-
-s = 'This string will not include \
-backslashes or newline characters.'