| <?php namespace Sieve; |
| |
| include_once('SieveToken.php'); |
| |
| class SieveScanner |
| { |
| public function __construct(&$script) |
| { |
| if ($script === null) |
| return; |
| |
| $this->tokenize($script); |
| } |
| |
| public function setPassthroughFunc($callback) |
| { |
| if ($callback == null || is_callable($callback)) |
| $this->ptFn_ = $callback; |
| } |
| |
| public function tokenize(&$script) |
| { |
| $pos = 0; |
| $line = 1; |
| |
| $scriptLength = mb_strlen($script); |
| |
| $unprocessedScript = $script; |
| |
| |
| //create one regex to find the right match |
| //avoids looping over all possible tokens: increases performance |
| $nameToType = []; |
| $regex = []; |
| // chr(65) == 'A' |
| $i = 65; |
| |
| foreach ($this->tokenMatch_ as $type => $subregex) { |
| $nameToType[chr($i)] = $type; |
| $regex[] = "(?P<". chr($i) . ">^$subregex)"; |
| $i++; |
| } |
| |
| $regex = '/' . join('|', $regex) . '/'; |
| |
| while ($pos < $scriptLength) |
| { |
| if (preg_match($regex, $unprocessedScript, $match)) { |
| |
| // only keep the group that match and we only want matches with group names |
| // we can use the group name to find the token type using nameToType |
| $filterMatch = array_filter(array_filter($match), 'is_string', ARRAY_FILTER_USE_KEY); |
| |
| // the first element in filterMatch will contain the matched group and the key will be the name |
| $type = $nameToType[key($filterMatch)]; |
| $currentMatch = current($filterMatch); |
| |
| //create the token |
| $token = new SieveToken($type, $currentMatch, $line); |
| $this->tokens_[] = $token; |
| |
| if ($type == SieveToken::Unknown) |
| return; |
| |
| // just remove the part that we parsed: don't extract the new substring using script length |
| // as mb_strlen is \theta(pos) (it's linear in the position) |
| $matchLength = mb_strlen($currentMatch); |
| $unprocessedScript = mb_substr($unprocessedScript, $matchLength); |
| |
| $pos += $matchLength; |
| $line += mb_substr_count($currentMatch, "\n"); |
| } else { |
| $this->tokens_[] = new SieveToken(SieveToken::Unknown, '', $line); |
| return; |
| } |
| |
| } |
| |
| $this->tokens_[] = new SieveToken(SieveToken::ScriptEnd, '', $line); |
| } |
| |
| public function nextTokenIs($type) |
| { |
| return $this->peekNextToken()->is($type); |
| } |
| |
| public function peekNextToken() |
| { |
| $offset = 0; |
| do { |
| $next = $this->tokens_[$this->tokenPos_ + $offset++]; |
| } while ($next->is(SieveToken::Comment|SieveToken::Whitespace)); |
| |
| return $next; |
| } |
| |
| public function nextToken() |
| { |
| $token = $this->tokens_[$this->tokenPos_++]; |
| |
| while ($token->is(SieveToken::Comment|SieveToken::Whitespace)) |
| { |
| if ($this->ptFn_ != null) |
| call_user_func($this->ptFn_, $token); |
| |
| $token = $this->tokens_[$this->tokenPos_++]; |
| } |
| |
| return $token; |
| } |
| |
| protected $ptFn_ = null; |
| protected $tokenPos_ = 0; |
| protected $tokens_ = array(); |
| protected $tokenMatch_ = array ( |
| SieveToken::LeftBracket => '\[', |
| SieveToken::RightBracket => '\]', |
| SieveToken::BlockStart => '\{', |
| SieveToken::BlockEnd => '\}', |
| SieveToken::LeftParenthesis => '\(', |
| SieveToken::RightParenthesis => '\)', |
| SieveToken::Comma => ',', |
| SieveToken::Semicolon => ';', |
| SieveToken::Whitespace => '[ \r\n\t]+', |
| SieveToken::Tag => ':[[:alpha:]_][[:alnum:]_]*(?=\b)', |
| /* |
| " # match a quotation mark |
| ( # start matching parts that include an escaped quotation mark |
| ([^"]*[^"\\\\]) # match a string without quotation marks and not ending with a backlash |
| ? # this also includes the empty string |
| (\\\\\\\\)* # match any groups of even number of backslashes |
| # (thus the character after these groups are not escaped) |
| \\\\" # match an escaped quotation mark |
| )* # accept any number of strings that end with an escaped quotation mark |
| [^"]* # accept any trailing part that does not contain any quotation marks |
| " # end of the quoted string |
| */ |
| SieveToken::QuotedString => '"(([^"]*[^"\\\\])?(\\\\\\\\)*\\\\")*[^"]*"', |
| SieveToken::Number => '[[:digit:]]+(?:[KMG])?(?=\b)', |
| SieveToken::Comment => '(?:\/\*(?:[^\*]|\*(?=[^\/]))*\*\/|#[^\r\n]*\r?(\n|$))', |
| SieveToken::MultilineString => 'text:[ \t]*(?:#[^\r\n]*)?\r?\n(\.[^\r\n]+\r?\n|[^\.][^\r\n]*\r?\n)*\.\r?(\n|$)', |
| SieveToken::Identifier => '[[:alpha:]_][[:alnum:]_]*(?=\b)', |
| SieveToken::Unknown => '[^ \r\n\t]+' |
| ); |
| } |