How to implement the context-related lexical analysis?
2 posts in topic
Flat View  Flat View
TOPIC ACTIONS:
 

Posted By:   Lansong_Diao
Posted On:   Tuesday, April 15, 2003 08:46 PM

I know that lex can get context information via the "start" directive. Some strings will be returned as different tokens in different contexts. How can this be done in ANTLR?

Re: How to implement the context-related lexical analysis?

Posted By:   Gunnar_Wagenknecht  
Posted On:   Monday, May 19, 2003 09:29 AM

I've done this for a lexer for the Harbour programming language with a custom TokenStream filter implementation. This wraps the lexer and analyzes the tokens. It allowed me to have IDENTIFIERS sometimes being matched as KEYWORDS and sometimes only as IDENTIFIERS. Attached you will find a mail that describes the idea.


-----Original Message-----
From: Terence Parr [mailto:parrt@jguru.com]
Sent: Friday, March 21, 2003 6:35 PM
To: Gunnar Wagenknecht
Subject: Re: Possible Features

On Thursday, March 13, 2003, at 11:59 AM, Gunnar Wagenknecht wrote:

> ->keywords are context sensitive, sometimes they are keywords and in
> ->some
> context they are just IDENTIFIERS

ouch!

>
> I could work around that using a custom TokenStream filter that
> analyzes
> the token stream from the lexer and returns the identified token or
> reduces it to an identifier if necessary.
>


And the class:

/**
* Harbour Development Tools
* $Source: $
* $Revision: $
*
* (c) Copyright Gunnar Wagenknecht (Planet-Wagenknecht.de)
* (c) Copyright QNX Software Systems Ltd. 2002
* (c) Copyright IBM Corp. 2000, 2001, 2002
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Common Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/cpl-v10.html
*
* Contributors:
* Gunnar Wagenknecht - API and implementation for Harbour
*
*/
package org.harbour.ide.internal.core.parser;

import java.util.LinkedList;

import antlr.CommonHiddenStreamToken;
import antlr.Token;
import antlr.TokenStream;
import antlr.TokenStreamException;
import antlr.TokenStreamHiddenTokenFilter;

/**
* Specialized TokenStream for the Harbour programming
* language.
*
* The HarbourLexer uses this TokenStream. It
* is an essential part for recognizing the Harbour language because it
* needs some rules because of context sensitive token meanings
* (eg. most keywords arn't reserver in Harbour, some are only
* valid on line starts).
*
* @author GWagenknecht
*/
class HarbourLanguageTokenStream
extends TokenStreamHiddenTokenFilter
implements HarbourTokenTypes
{
/**
* Indicates if a key word type wants a variable (including macros).
* true for
* DO
* FOR
* PRIVATE
* PUBLIC
*/
public static final boolean wantsVar(int tokenType)
{
switch (tokenType)
{
case DO :
return true;
case FOR :
return true;
case PRIVATE :
return true;
case PUBLIC :
return true;

default :
return false;
}
}

/**
* Reduces the given token to one token of the specified type.
* @param tokenType
* @param tokens
* @return the new token
*/
public static final Token reduceToOne(int tokenType, Token tokens[])
{
int startLine = 0, endLine = 0, startColumn = 0, endColumn = 0;
StringBuffer text = new StringBuffer();
CommonHiddenStreamToken hiddenBefore = null;
CommonHiddenStreamToken hiddenAfter = null;

if (tokens.length <= 0)
throw new IllegalArgumentException("tokens is empty");

for (int i = 0; i < tokens.length; i++)
{
ExtendedToken token = (ExtendedToken) tokens[i];
if (i == 0)
{
startLine = token.getLine();
startColumn = token.getColumn();
endLine = token.endLine;
endColumn = token.endColumn;
text.append(token.getText());
hiddenBefore = token.getHiddenBefore();
hiddenAfter = token.getHiddenAfter();
}
else
{
while (endLine < token.getLine())
{
text.append("
");
endLine++;
endColumn = 0;
}
while (endColumn < token.getColumn())
{
text.append(" ");
endColumn++;
}

endLine = token.endLine;
endColumn = token.endColumn;
text.append(token.getText());
hiddenAfter = token.getHiddenAfter();
}

}
ExtendedToken token =
new ExtendedToken(
tokenType,
text.toString(),
startLine,
startColumn,
endLine,
endColumn);

token.setHiddenBefore(hiddenBefore);
token.setHiddenAfter(hiddenAfter);
return token;
}

/**
* Splits the given multi-word token into single word tokens
* @param token
* @param tokenTypes
* @return array with the new tokens
*/
public static final Token[] splitInto(Token token, int tokenTypes[])
{
if (tokenTypes.length <= 0)
throw new IllegalArgumentException("array of token types is empty");

LinkedList tokens = new LinkedList();
StringBuffer tokenText = new StringBuffer();
char[] text = token.getText().toCharArray();
for (int i = 0; i < text.length; i++)
{
char ch = text[i];
if (ch == ' ' || ch == ' ' || ch == 'f')
{
if (tokenText.length() > 0)
{
ExtendedToken newToken =
new ExtendedToken(
tokens.size(),
tokenText.toString(),
token.getLine(),
token.getColumn() + i,
token.getLine(),
token.getColumn() + i + tokenText.toString().length());
tokens.add(newToken);
tokenText.setLength(0);
}
}
else
{
tokenText.append(ch);
}
}

((ExtendedToken) tokens.getFirst()).setHiddenBefore(
((ExtendedToken) token).getHiddenBefore());
((ExtendedToken) tokens.getLast()).setHiddenAfter(
((ExtendedToken) token).getHiddenAfter());

return (Token[]) tokens.toArray(new Token[tokens.size()]);
}

/**
* Reduced the given token to type IDENTIFIER.
* @param token
* @return the token for convince
*/
public static final Token reduceToIdentifier(Token token)
{
token.setType(IDENTIFIER);
return token;
}

/**
* Reduced the given token to the specified type.
* @param token
* @param type
* @return the token for convince
*/
public static final Token reduceTo(Token token, int type)
{
token.setType(type);
return token;
}

/**
* Doesn't modify the token. This methode exists for a easier
* reading of the source code.
* @param token
* @return the token for convince
*/
public static final Token passThrough(Token token)
{
token.setType(IDENTIFIER);
return token;
}

/**
* Indicates if a key word type wants an identifier.
*/
public static final boolean wantsId(int tokenType)
{
switch (tokenType)
{
case ANNOUNCE :
return true;
case CATCH :
return true;
case EXTERN :
return true;
case FIELD :
return true;
case FUNCTION :
return true;
case LOCAL :
return true;
case GLOBAL :
return true;
case EXTERNGLOBAL :
return true;
case MEMVAR :
return true;
case PARAMETERS :
return true;
case PROCEDURE :
return true;
case ENUM :
return true;
case STATIC :
return true;

default :
return false;
}
}

/**
* Indicates if a key word type wants an end of line.
*/
public static final boolean wantsEOL(int tokenType)
{
switch (tokenType)
{
case ELSE :
return true;
case END :
return true;
case ENDCASE :
return true;
case ENDDO :
return true;
case ENDIF :
return true;
case EXIT :
return true;
case LOOP :
return true;
case NEXT :
return true;
case OTHERWISE :
return true;
case RECOVER :
return true;
case TRY :
return true;

default :
return false;
}
}

/**
* Indicates if a key word type wants an expression.
*/
public static final boolean wantsExpr(int tokenType)
{
switch (tokenType)
{
case BREAK :
return true;
case CASE :
return true;
case ELSEIF :
return true;
case IF :
return true;
case IN :
return true;
case RETURN :
return true;
case WHILE :
return true;
case WITHOBJ :
return true;
case SWITCH :
return true;

default :
return false;
}
}

/** wants everything except an operator */
public static final int REJECT_OP = 707;

public static final boolean isOperator(int tokenType)
{
return (
tokenType == INC
|| tokenType == DEC
|| tokenType == INASSIGN
|| tokenType == ALIASOP
|| tokenType == PLUSEQ
|| tokenType == MINUSEQ
|| tokenType == MULTEQ
|| tokenType == DIVEQ
|| tokenType == EXPEQ
|| tokenType == MODEQ
|| tokenType == EQUALS);
}

/**
* Creates a new HarbourLanguageTokenStream instance.
* @param input
*/
public HarbourLanguageTokenStream(TokenStream input)
{
super(input);
hide(COMMENT_LINE);
hide(COMMENT_BLOCK);
hide(COMMENT_HARBOURDOC);
hide(NEWLINE_CONTINUE);
queuedTokens = new LinkedList();
}

/** the token queue */
private LinkedList queuedTokens;

/** the last token (for look back) */
private Token lastToken;

/* (non-Javadoc)
* @see antlr.TokenStream#nextToken()
*/
public Token nextToken() throws TokenStreamException
{
Token token;

// pull out queued tokens if empty
if (!queuedTokens.isEmpty())
token = (Token) queuedTokens.getFirst();
else
{
token = super.nextToken();
queuedTokens.addFirst(token);
}

// don't analyze common types
switch (token.getType())
{
case EOF :
case IDENTIFIER :
case FUNCTION :
case PROCEDURE :
case INIT_FUNCTION :
case INIT_PROCEDURE :
case STATIC_FUNCTION :
case STATIC_PROCEDURE :
case EXIT_FUNCTION :
case EXIT_PROCEDURE :
case NUM_DOUBLE :
case NUM_HEX :
case NUM_INT :
case NUM_LONG :
case ALIASOP :
case AT :
case COLON :
case DEC :
case INC :
case DIV :
case DIVEQ :
case DOLLAR :
case DOT :
case EPSILON :
case EQ :
case EQUALS :
case EXPEQ :
case GT :
case GE :
case LT :
case LE :
case MOD :
case MODEQ :
case MULT :
case MULTEQ :
case SEMI :
case NEWLINE :
case PIPE :
case MINUS :
case MINUSEQ :
case PLUS :
case PLUSEQ :
case POWER :
case NOT :
case AND :
case OR :
case TRUE :
case FALSE :
case TRANSLATEOP :
case MACROOP :
case INASSIGN :
case NE :
case LPAREN :
case RPAREN :
case LBRACK :
case RBRACK :
case LCURLY :
case RCURLY :
case LITERAL :
case COMMA :
return (Token) queuedTokens.removeFirst();
}

int tokenType = token.getType();

// KeyWords are grouped based on the next token they require.

/* Wants any Var. */
if (wantsVar(tokenType))
{
switch (lookahead(1).getType())
{
case IDENTIFIER :
case MACROVAR :
case MACROTEXT :
case MACROOP :
passThrough(token);
break;
default :
reduceToIdentifier(token);
break;
}
}

/* Wants Identifier. */
else if (wantsId(tokenType))
{
switch (lookahead(1).getType())
{
case IDENTIFIER :
passThrough(token);
break;
default :
reduceToIdentifier(token);
break;
}
}

/* Wants EOL */
else if (wantsEOL(tokenType))
{
switch (lookahead(1).getType())
{
case NEWLINE :
case SEMI :
passThrough(token);
break;

default :
reduceToIdentifier(token);
break;
}
}

/* Wants Expression ( DOESN'T WANT OPERATORS )
* At BOL, if followed by operator (other than logicals .t., .f., !)
* than Identifier else COMMAND.
*/
else if (wantsExpr(tokenType))
{

switch (lookahead(1).getType())
{
case INC :
case DEC :
passThrough(token);
break;

case INASSIGN :
case ALIASOP :
case PLUSEQ :
case MINUSEQ :
case MULTEQ :
case DIVEQ :
case EXPEQ :
case MODEQ :
case EQUALS :
reduceToIdentifier(token);
break;

case OPTIONAL :
passThrough(token);
reduceToIdentifier(lookahead(1));
break;

default :
passThrough(token);
Token next = lookahead(1);
if (wantsId(next.getType()))
reduceToIdentifier(next);
else if (wantsExpr(next.getType()))
reduceToIdentifier(next);
else if (wantsVar(next.getType()))
reduceToIdentifier(next);
break;
}
}

else
{
switch (tokenType)
{
/* DECLARE as PRIVATE. */
case DECLARE :
{
switch (lookahead(1).getType())
{
case IDENTIFIER :
switch (lookahead(2).getType())
{
case LBRACK :
case COMMA :
case NEWLINE :
case SEMI :
case INASSIGN :
reduceTo(token, PRIVATE);
break;

/* Any other DECLARE IDENTIFIER, must
* be Strong Type DECLARE.
*/
default :
passThrough(token);
break;
}
break;

case MACROVAR :
case MACROTEXT :
reduceTo(token, PRIVATE);
break;

default :
/* Any other DECLARE must be Identifier. */
reduceToIdentifier(token);
break;
}
}
break;

/* FIELD_ALIAS is NOT at BOL wants only "->". */
case FIELD_ALIAS :
{
if (lookahead(1).getType() == ALIASOP)
passThrough(token);
else
reduceToIdentifier(token);
}
break;

/* reduce "qself()" to SELF */
case QSELF :
{
if (lookahead(1).getType() == LPAREN
&& lookahead(2).getType() == RPAREN)
{
// reduce "qself" '(' ')' to one token
Token self =
reduceToOne(
SELF,
new Token[] {
token,
(Token) queuedTokens.remove(1),
(Token) queuedTokens.remove(1)});
queuedTokens.set(0, self);
}
else
reduceToIdentifier(token);
}
break;

/* FOR EACH */
case FOREACH :
{
if (lookahead(1).getType() == IDENTIFIER)
passThrough(token);
else
reduceToIdentifier(token);
}
break;

/* DO CASE */
case DOCASE :
{
switch (lookahead(1).getType())
{
case NEWLINE :
case SEMI :
passThrough(token);
break;

default :
// split "do case" into DO + IDENTIFIER
Token[] tokens =
splitInto(token, new int[] { DO, IDENTIFIER });
queuedTokens.set(0, tokens[0]);
queuedTokens.add(1, tokens[1]);
break;
}
}
break;

/* DO WHILE WITH */
case DO_WHILE_WITH :
{
switch (lookahead(1).getType())
{
case NEWLINE :
case SEMI :
// reduce "do while with" to WHILE + WITH
Token[] splittedTokens =
splitInto(token, new int[] { DO, WHILE, WITH });
Token whileToken =
reduceToOne(
WHILE,
new Token[] {
splittedTokens[0],
splittedTokens[1] });
queuedTokens.set(0, whileToken);
queuedTokens.add(1, splittedTokens[2]);
break;

default :
// split "do while with" into DO + IDENTIFIER + WITH
Token[] tokens =
splitInto(token, new int[] { DO, IDENTIFIER, WITH });
queuedTokens.set(0, tokens[0]);
queuedTokens.add(1, tokens[1]);
queuedTokens.add(2, tokens[2]);
break;
}
}
break;

/* DO WHILE */
case DO_WHILE :
{
switch (lookahead(1).getType())
{
case NEWLINE :
case SEMI :
// split "do while" into DO + IDENTIFIER
Token[] tokens =
splitInto(token, new int[] { DO, IDENTIFIER });
queuedTokens.set(0, tokens[0]);
queuedTokens.add(1, tokens[1]);
break;

default :
// reduce to WHILE
reduceTo(token, WHILE);
break;
}
}
break;

/* OPTIONAL */
case OPTIONAL :
{
switch (lookahead(1).getType())
{
case IDENTIFIER :
case AT :
passThrough(token);
break;

default :
reduceToIdentifier(token);
break;
}
}
break;

/* TO or STEP */
case TO :
case STEP :
{
switch (lastToken.getType())
{
case NUM_INT :
case NUM_LONG :
case NUM_DOUBLE :
case IDENTIFIER :
case MACROVAR :
case MACROTEXT :
case RPAREN :
case RBRACK :
case LITERAL :
passThrough(token);
break;

default :
reduceToIdentifier(token);
break;
}
}
break;

/* WITH */
case WITH :
{
switch (lastToken.getType())
{
case IDENTIFIER :
case MACROVAR :
case MACROTEXT :
case RPAREN :
passThrough(token);
break;

default :
reduceToIdentifier(token);
break;
}
}
break;

/* IN */
case IN :
{
switch (lastToken.getType())
{
case IDENTIFIER :
passThrough(token);
break;

default :
reduceToIdentifier(token);
break;
}
}
break;
}
}

return (Token) queuedTokens.removeFirst();
}

/**
* Returns the token at the position ahead.
* @param position
* @return
* @throws TokenStreamException
*/
protected Token lookahead(int position) throws TokenStreamException
{
if (queuedTokens.size() > position && null != queuedTokens.get(position))
{
return (Token) queuedTokens.get(position);
}
Token token = super.nextToken();
queuedTokens.addLast(token);
if (queuedTokens.indexOf(token) != position)
throw new TokenStreamException("Token queue index out of sync!");

return token;
}
}

Re: How to implement the context-related lexical analysis?

Posted By:   Monty_Zukowski  
Posted On:   Thursday, April 17, 2003 08:15 AM

Look into the documentation about lexical states. http://www.antlr.org/doc/lexer.html#Lexical_States.
About | Sitemap | Contact