SQL Expression Parser and Builder

This is a rough draft of an SQL expression parser kit. What it does is help to break apart an SQL expression, manipulate it's parts, and then reconstruct it into another SQL expression. It's useful if you take some bits of SQL as input, and then insert that into a query. I'm using it to construct user-friendly labels for rows.

This example takes an expression and prepends 'destination.' to each table name. It parses the expression into a tree. Then, it sends a Visitor into the tree to rename the table names. Finally, another visitor traverses the tree to reconstruct the expression. For example, "concat(a,b)" is altered to "concat( `destination.a`, `destination.b` )". Parser code is derived partly from code in Wikipedia, and other web articles. Visitor code is from Wikipedia and slightly modified. This code has NOT been tested. (I put it on the web because I couldn't find this code anywhere, at least not in PHP.)



include_once 'SQLTokenizer.class.php';

/**
 * SQL Parser (yet another).  This one scans the sources three times: tokenize, add types, parse.
 * This should be more OO - the AST is in arrays, not objects.
 */
class SQLExprParser 
{
	var $AST;
	var $exprStack;
	var $listStack;
	var $typedArray;

	function SQLExprParser( &$tokenizer )
	{
		$this->typedArray =& $tokenizer->typedArray;
	}

	/**
	 * The parser is app specific.  It only parses expressions.
	 */
	function parse()
	{
		$offset =& $this->offset;
		$offset = 0;
		$this->AST = $this->pSUM();
	}
	function acceptType( $s )
	{
		//report( "acceptType $s .".$this->typedArray[$this->offset][0] );
		if ( $this->typedArray[ $this->offset ][1] == $s )
		{
			$token = $this->typedArray[ $this->offset ][0];
			$this->_next();
			return $token;
		}
		return false;
	}
	function expectType( $s )
	{
		//report( "expectType $s .".$this->typedArray[$this->offset][0] );
		if ( $token = $this->acceptType( $s ) )
			return $token;
		die("expectType: unexpected symbol $s. Next symbol is ".$this->typedArray[$this->offset][0]);
	}
	function _next()
	{
		$this->offset++;
	}
	/** 
	 * p means parse.
	 */
	function pSUM()
	{
		$expr = $this->pFACTOR();
		list($op,$list) = $this->pSUMREST();
		if ($op)
		{
			return new ASTNode( 'OPERATOR', $op, array($expr, $list) );
		}
		else
			return $expr;
	}
	function pSUMREST()
	{
		if ($token = $this->acceptType('OPERATOR'))
		{
			$expr = $this->pSUM();
			return array( $token, $expr );
		}
	}
	function pFACTOR()
	{
		if ( $token = $this->acceptType('STRING') ) 
		{
			return new ASTNode( 'STRING', $token );
		} 
		else if ( $token = $this->acceptType('BAREWORD') )
		{
			// if the next token is a paren, we have an error
			// because it looks like a function
			return new ASTNode( 'NAME', $token );
		}
		else if ( $token = $this->acceptType('NUMBER') )
		{
			return new ASTNode( 'NUMBER', $token );
		}
		else if ( $token = $this->acceptType('NAME') )
		{
			return new ASTNode( 'NAME', $token );
		} 
		else if ( $token = $this->acceptType('FUNCTION') )
		{
			$list = $this->pARGS();
			return new ASTNode( 'FUNCTION', $token, $list );
		} 
		else if ( $token = $this->acceptType('LEFTP') )
		{
			$expr = $this->pSUM();
			$this->expectType('RIGHTP');
			return new ASTNode ( 'GROUP', '',  $expr );
		} 
		else 
		{
			$expr = $this->pUNARY();
			return $expr;
		} 
	}
	function pUNARY()
	{
		$this->expectType('MINUS');
		$expr = $this->pFACTOR();
		return new ASTNode( 'UNARY', '-', $expr );
	}
	function pARGS()
	{
		//report('pARGS');
		$this->expectType('LEFTP');
		$list = $this->pLIST();
		$this->expectType('RIGHTP');
		return $list;
	}
	function pLIST()
	{
		//report('pEXPRLIST');
		$expr = $this->pSUM();
		$rest = $this->pLISTREST();
		return array_merge( array($expr), $rest );
	}
	function pLISTREST()
	{
		if ($this->acceptType('COMMA'))
		{
			return $this->pLIST();
		}
		return array();
	}
}

class ASTNode
{
	var $type;
	var $token;
	var $children;
	function ASTNode( $type, $token, $children=null )
	{
		$this->type = $type;
		$this->token = $token;
		$this->children = array();
		if ($children)
		{
			$this->addChildren( $children );
		}
	}
	function addChildren( &$children )
	{
		if (is_array($children))
		{
			$this->children =& array_merge($this->children, $children);
		}
		else // is not array
		{
			array_push($this->children,$children);
		}
	}
	/** 
	 * For visitor pattern. See wikipedia.
	 */
	function accept( &$visitor )
	{
		$visitor->visit($this);
	}
}

class ASTVisitor {
	var $autoRecursion = true;
	function ASTVisitor()
	{
		$this->autoRecursion = true;
	}
	/**
	 * Applies the action if the node is the type we want.
	 */
	function visit( &$node )
	{
		$method = "visit".$node->type;	
		if ($node->type && method_exists($this,$method))
		{
			//print ($method . " called<br>");
			$this->$method(&$node);
		}
		if ($this->autoRecursion)
		{
			$this->visitChildren( &$node );
		}
	}
	function visitChildren( &$node )
	{
		if (is_array($node->children))
		{
			foreach($node->children as $child)
			{
				$child->accept(&$this);
			}
		}
	}
	function visitChild( &$node, $index )
	{
		if ( isset( $node->children[$index] ) )
		{
			$node->children[$index]->accept(&$this);
		}
	}
}
class ConcretePrependerASTVisitor extends ASTVisitor
{
	var $prefix;

	function ConcretePrependerASTVisitor( $prefix, &$node )
	{
		$this->prefix = $prefix;
		$this->visit(&$node);
	}
	/** 
	 * Prepends the string 'destination.' to
	 * the name, if it doesn't already have a prefix.
	 * @param string $string to modify
	 */
	function _prepend( $string )
	{	
		$string = rtrim($string,'`');
		$string = ltrim($string,'`');
		if (preg_match('/\\./', $string))
			return '`'.$string.'`';
		return '`'.$this->prefix.'.'.$string.'`';
	}
	function visitNAME(&$node)
	{
		$node->token = $this->_prepend( $node->token );
		//print("setting token to ".$node->token."<br>");
	}
}

class SQLExprBuilderASTVisitor extends ASTVisitor
{
	var $output;
	var $head;
	/**
	 * This resconstructs the expression from the AST.
	 * The AST is a lisp-like functional tree, without operator precedence.
	 */
	function SQLExprBuilderASTVisitor( &$node )
	{
		$this->output = '';
		$this->autoRecursion = false;
		$this->head = $node;
	}
	function asString()
	{
		$this->visit($this->head);
		return $this->output;
	}

	function visitFUNCTION(&$node) 
	{ 
		$this->output .= $node->token.'( ';
		$count = 0;
		foreach($node->children as $child)
		{
			if ($count) $this->output.=', ';
			$child->accept(&$this);
			$count++;
		}
		$this->output .= ' )';
	}
	function visitNAME(&$node) 
	{ 
		$this->output .= $node->token;
	}
	function visitOPERATOR(&$node) 
	{ 
		$this->visitChild( &$node, 0 );
		$this->output .= ' '.$node->token.' ';
		$this->visitChild( &$node, 1 );
	}
	function visitSTRING(&$node) 
	{ 
		$this->output .= $node->token;
	}
	function visitNUMBER(&$node) 
	{ 
		$this->output .= $node->token;
	}
}

The SQLTokenizer class is on another page (follow the link). (This is all just for reading, not using. The whole package will be up elsewhere.)

Here's a testing page.


include('SQLExprParser.class.php');

if (isset($_GET['sql']))
{
	print "
";
	//print_r(tokenizeSQL($_GET['sql']));
	$t = new SQLTokenizer();
	$t->tokenize( $_GET['sql'] );
	/*
	print_r($t->tokenArray);
	print "--------------\n";
	print_r($t->typedArray);
	print "--------------\n";
	*/
	$p = new SQLExprParser( &$t );
	$p->parse();
	$v = new ConcretePrependerASTVisitor( 'destination', &$p->AST );
	print_r($p->AST);
	print "--------------\n";
	$v = new SQLExprBuilderASTVisitor( &$p->AST );
	$output = $v->asString();
	print $output;
	print "

";
}

?>

<?=$_GET['sql']?></textarea>

This is the yacc/bison compatible grammar that I used to start off. I got it, partly, from the BNF Web Club.


%token NAME
%token NUMBER
%token STRING
%token BAREWORD
%token FUNCTION
%token OPERATOR
%%

/* 
   This bison file was used to test the grammar for LALR validity.
   The parser was generated by hand from this.

	To make the parser LL, remove left recursion.
	A clean way is to break left-recursive productions
	at terminal tokens.
 */

/* sum : factor { operator factor } */

SUM : FACTOR SUMREST

SUMREST : OPERATOR SUM
	|

FACTOR : FUNCTION ARGS
	| NAME 
	| NUMBER
	| STRING 
	| BAREWORD
	| '(' SUM ')'
	| UNARY
	|
	
UNARY :  '-' FACTOR
ARGS : '(' LIST ')'

/* list : sum { ',' sum } */

LIST : SUM LISTREST

LISTREST : ',' LIST
	|