\documentclass[12pt]{beamer} \mode \author{Aleksej Saushev \\ \texttt{asau@inbox.ru}} \date{Some day} \title[Crash course]{Parsing for Impatient} \begin{document} \frame{\titlepage} \begin{frame}[fragile]{Context-free grammars} Context-free grammar (CFG) is: \begin{itemize} \item Alphabet, set of terminal symbols, ``terminals.'' \item Set of non-terminal symbols, ``nonterminals.'' \item Starting symbol. \item Rules (productions) like: \begin{eqnarray*} S & \rightarrow & S_0 S_1 S_2 ... S_{n-1} \\ S & \rightarrow & \_ \end{eqnarray*} \end{itemize} Terminals and nonterminals are disjunctive. Recursive rules allowed. \end{frame} \begin{frame}[fragile]{Context-free grammars} Sample grammar: \begin{itemize} \item Alphabet of decimal digits and arithmetic operations ``$+$'' and ``$-$''. \item $E$ is the only nonterminal. \item $E$ is starting symbol \item Rules: \begin{eqnarray*} E & \rightarrow & 1 \\ E & \rightarrow & E + E \end{eqnarray*} \end{itemize} \end{frame} \begin{frame}[fragile]{Context-free grammars} CFG is ``generative'' \begin{equation*} E \rightarrow E + E \rightarrow E + E + E \rightarrow E + 1 + E \rightarrow E + 1 + 1 \rightarrow 1 + 1 + 1 \end{equation*} Sentences of the language are \underline{produced} from start symbol by rewriting strings of symbols following given rules. \end{frame} \begin{frame}[fragile]{Unger} Straightforward interpretation of grammar rule: \\ symbols or rules matche substring of input string between given points. \begin{itemize} \item Terminal symbol matches substring of length 1 and only if the input symbol matches. \item Non-terminal symbol matches, if there's rule for that symbol that matches. \item Empty rule matches empty substring. \item Rule $S \rightarrow S_0 S_1 S_2 ... S_{n-1}$ matches substring of input string between points $p$ and $p'$, if there exist points $p = p_0 \leq p_1 \leq p_2 \leq ... \leq p_{n-1} \leq p_n = p'$ such that $S_k$ matches substring between points $p_{k}$ and $p_{k+1}$, terminal symbols match equal terminal symbols \end{itemize} Input string matches start symbol. \end{frame} \begin{frame}[fragile]{Unger} Formally: \begin{itemize} \item $s(S, w, p, p') := t(S), p' = p + 1, w[p] \sim T$, \item $s(S, w, p, p') := nt(S), r(S \rightarrow S_0 ... S_{n-1}, w, p, p')$, \item $r(S \rightarrow \_, w, p, p') := p' = p$, \item \begin{multline*} r(S \rightarrow S_0 ... S_{n-1}, w, p, p') := \\ \begin{aligned} & p = p_0 \leq ... \leq p_{n-1} \leq p_n = p', \\ & s(S_0, w, p_0, p_1), \\ & s(S_1, w, p_1, p_2), \\ & ... \\ & s(S_{n-1}, w, p_{n-1}, p_n). \end{aligned} \end{multline*} \end{itemize} \end{frame} \begin{frame}[fragile]{Unger} Strings are finite: \begin{equation*} 0 \leq p \leq l \end{equation*} \pause \begin{center} Brute force search. \end{center} \end{frame} \begin{frame}[fragile]{Unger} Left recursion: \begin{eqnarray*} S & \rightarrow & S S, \\ S & \rightarrow & \_ \end{eqnarray*} \begin{eqnarray*} && s(S, w, 0, 0), \\ && r(S \rightarrow S S, w, 0, 0), \\ && s(S, w, 0, 0), s(S, w, 0, 0), \\ && r(S \rightarrow S S, w, 0, 0), s(S, w, 0, 0), \\ && ... \end{eqnarray*} \end{frame} \begin{frame}[fragile]{Unger} Left recursion: \begin{eqnarray*} S & \rightarrow & S S \end{eqnarray*} \begin{eqnarray*} && s(S, w, 0, 0), \\ && r(S \rightarrow S S, w, 0, 0), \\ && s(S, w, 0, 0), s(S, w, 0, 0), \\ && r(S \rightarrow S S, w, 0, 0), s(S, w, 0, 0), \\ && ... \end{eqnarray*} \end{frame} \begin{frame}[fragile]{Unger} Reentered rule matcher with the same arguments. Elimination of non-producing cases. \end{frame} \begin{frame}[fragile]{Unger} Recursion elimination. \begin{multline*} \begin{aligned} r(S \rightarrow S_0 ... S_{n-1}, w, p, p') := \\ {\rm (if~matching~already,~fail~immediately;} \\ {\rm otherwise~mark~matching),}\\ p = p_0 \leq ... \leq p_{n-1} \leq p_n = p', \\ s(S_0, w, p_0, p_1), \\ s(S_1, w, p_1, p_2), \\ ... \\ s(S_{n-1}, w, p_{n-1}, p_n). \end{aligned} \end{multline*} \end{frame} \begin{frame}[fragile]{Unger} Exponential time. \begin{multline*} \begin{aligned} r(S \rightarrow S_0 ... S_{n-1}, w, p, p') := \\ {\rm (if~matching~already,~fail~immediately;} \\ {\rm otherwise~mark~matching),} \\ {\rm (if~have~result,~return~result~immediately),} \\ p = p_0 \leq ... \leq p_{n-1} \leq p_n = p', \\ s(S_0, w, p_0, p_1), \\ s(S_1, w, p_1, p_2), \\ ... \\ s(S_{n-1}, w, p_{n-1}, p_n), \\ {\rm (remember~result).} \end{aligned} \end{multline*} No longer exponential. (Books fail to tell you that!) \end{frame} \begin{frame}[fragile]{Combinators} \begin{eqnarray*} r'(R, w, p) & := & \{p' | r(R, w, p, p')\}. \\ s'(S, w, p) & := & \{p' | s(S, w, p, p')\}. \end{eqnarray*} \begin{eqnarray*} r'(S \rightarrow S_0 ... S_{n-1}, w, p) & := & \{p' | r(S \rightarrow S_0 ... S_{n-1}, w, p, p')\}. \\ s'(S, w, p) & := & \{p' | s(S, w, p, p')\}. \end{eqnarray*} \begin{eqnarray*} r(R, w, p, p') & \Leftrightarrow & p' \in r'(R, w, p). \\ s(S, w, p, p') & \Leftrightarrow & p' \in s'(S, w, p). \end{eqnarray*} \end{frame} \begin{frame}[fragile]{Combinators} \begin{equation*} r'(S \rightarrow \_, w, p) = \{p' | r(S \rightarrow _, w, p, p')\} = \{p' | p' = p\} = \{p\}. \end{equation*} \begin{multline*} s'(T, w, p) = \{p' | s(T, w, p, p')\} \\ = \{p' | p' = p + 1, w[p] \sim T\} \\ = \{p+1 | w[p] \sim T\}. \end{multline*} \begin{eqnarray*} s'(T, w, p) & = & \{p+1\},\quad {\rm when~} w[p] \sim T. \\ s'(T, w, p) & = & \{\}, \quad {\rm otherwise}. \end{eqnarray*} \end{frame} \begin{frame}[fragile]{Combinators} \begin{multline*} s'(S, w, p) = \{p' | s(S, w, p, p')\} \\ = \{p' | R = S \rightarrow S_0 ... S_{n-1}, r(R, w, p, p')\} \\ = \{p' | R = S \rightarrow S_0 ... S_{n-1}, p' \in r'(R, w, p)\} \\ = \bigcup \{r'(R, w, p) | R = S \rightarrow S_0 ... S_{n-1}\}. \end{multline*} \end{frame} \begin{frame}[fragile]{Combinators} \begin{equation*} r'(S \rightarrow S_0 ... S_{n-1}, w, p) = \{p' | r(S \rightarrow S_0 ... S_{n-1}, w, p, p')\} \end{equation*} \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ = \{p' | & p = p_0 \leq ... \leq p_{n-1} \leq p_n = p', & \\ & s(S_0, w, p_0, p_1), & \\ & s(S_1, w, p_1, p_2), & \\ & ... & \\ & s(S_{n-1}, w, p_{n-1}, p_n)\} & \\ \end{aligned} \end{multline*} \end{frame} \begin{frame}[fragile]{Combinators} \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ = \{p' | & p = p_0 \leq ... \leq p_{n-1} \leq p_n = p', \\ & s(S_0, w, p_0, p_1), \\ & s(S_1, w, p_1, p_2), \\ & ... \\ & s(S_{n-1}, w, p_{n-1}, p_n)\} \\ \end{aligned} \end{multline*} % \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ = \{p' | & p_0 = p, p' = p_n, \\ & p_1 \in s'(S_0, w, p_0), \\ & p_2 \in s'(S_1, w, p_1), \\ & ... \\ & p_n \in s'(S_{n-1}, w, p_{n-1})\} \end{aligned} \end{multline*} \end{frame} \begin{frame}[fragile]{Combinators} \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ = \{p' | & p_0 = p, p' = p_n, \\ & p_1 \in s'(S_0, w, p_0), \\ & p_2 \in s'(S_1, w, p_1), \\ & ... \\ & p_n \in s'(S_{n-1}, w, p_{n-1})\} \end{aligned} \end{multline*} % \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ = \{p_n | & p_1 \in s'(S_0, w, p), \\ & p_2 \in s'(S_1, w, p_1), \\ & ... \\ & p_n \in s'(S_{n-1}, w, p_{n-1})\} \end{aligned} \end{multline*} \end{frame} \begin{frame}[fragile]{Combinators} \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ = \{p_n | & p_1 \in s'(S_0, w, p), \\ & p_2 \in s'(S_1, w, p_1), \\ & ... \\ & p_n \in s'(S_{n-1}, w, p_{n-1})\} \end{aligned} \end{multline*} % \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ = \{p_n | & p_{n-1} \in \{p_{n-1} | & p_1 \in s'(S_0, w, p), & \\ & & p_2 \in s'(S_1, w, p_1), & \\ & & ... & \\ & & p_{n-1} \in s'(S_{n-2}, w, p_{n-2})\}, & \\ & p_n \in s'(S_{n-1}, w, p_{n-1})\} & \end{aligned} \end{multline*} \end{frame} \begin{frame}[fragile]{Combinators} \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ = \{p_n | & p_{n-1} \in \{p_{n-1} | & p_1 \in s'(S_0, w, p), & \\ & & p_2 \in s'(S_1, w, p_1), & \\ & & ... & \\ & & p_{n-1} \in s'(S_{n-2}, w, p_{n-2})\}, & \\ & p_n \in s'(S_{n-1}, w, p_{n-1})\} & \end{aligned} \end{multline*} % \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ = \{p_n | & p_{n-1} \in r'(S \rightarrow S_0 ... S_{n-2}, w, p) \\ & p_n \in s'(S_{n-1}, w, p_{n-1})\} \end{aligned} \end{multline*} \end{frame} \begin{frame}[fragile]{Combinators} \begin{equation*} \{y | x \in X, y \in f(x)\} = \bigcup \{f(x) | x \in X\} = u(m(f, X)), \end{equation*} where $m(f, X)$ is pointwise map of set X (``map''), $u(Y)$ is union of set of sets. \end{frame} \begin{frame}[fragile]{Combinators} \begin{equation*} \{y | x \in X, y \in f(x)\} = \bigcup \{f(x) | x \in X\} = u(m(f, X)), \end{equation*} % \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ = \{p_n | & p_{n-1} \in r'(S \rightarrow S_0 ... S_{n-2}, w, p) \\ & p_n \in s'(S_{n-1}, w, p_{n-1})\} & \end{aligned} \end{multline*} % \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) = u(m(&(p \mapsto s'(S_{n-1}, w, p)), \\ &r'(S \rightarrow S_0 ... S_{n-2}, w, p) \end{aligned} \end{multline*} \end{frame} \begin{frame}[fragile]{Combinators} \begin{multline*} \begin{aligned} r'(S \rightarrow S_0 ... S_{n-1}, w, p) = u(m(&(p \mapsto s'(S_{n-1}, w, p)), \\ &r'(S \rightarrow S_0 ... S_{n-2}, w, p) \end{aligned} \end{multline*} % \begin{multline*} r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ \begin{aligned} = u(m(&(p \mapsto s'(S_{n-1}, w, p)), \\ &u(m(&(p \mapsto s'(S_{n-2}, w, p)), \\ r'(S \rightarrow S_0 ... S_{n-2}, w, p) \end{aligned} \end{multline*} \end{frame} \begin{frame}[fragile]{Combinators} \begin{verbatim} r'(S -> S_0 ... S_{n-1}, w, p) = {p_n | p_1 <- s'(S_0, w, p), p_2 <- s'(S_1, w, p_1), ... p_{n-1} <- s'(S_{n-1}, w, p_{n-2}), p_n <- s'(S_{n-1}, w, p_{n-1})} = {p_n | p_{n-1} <- {p_{n-1} | p_1 <- s'(S_0, w, p), p_2 <- s'(S_1, w, p_1), ... p_{n-1} <- s'(S_{n-1}, w, p_{n-2})}, p_n <- s'(S_{n-1}, w, p_{n-1})} = {p_n | p_{n-1} <- r'(S -> S_0 ... S_{n-2}, w, p), p_n <- s'(S_{n-1}, w, p_{n-1})} \end{verbatim} \end{frame} \begin{frame}[fragile]{Combinators} \begin{verbatim} r'(S -> S_0 ... S_{n-1}, w, p) = {p_n | p_{n-1} <- r'(S -> S_0 ... S_{n-2}, w, p), p_n <- s'(S_{n-1}, w, p_{n-1})} = u(m((p +-> s'(S_{n-1}, w, p)), r'(S -> S_0 ... S_{n-2}, w, p))) \end{verbatim} \end{frame} \begin{frame}[fragile]{Combinators} \begin{verbatim} r'(S -> S_0 ... S_{n-1}, w, p) = u(m((p +-> s'(S_{n-1}, w, p)), u(m((p +-> s'(S_{n-2}, w, p)), r'(S -> S_0 ... S_{n-3}, w, p))))) = ... = u(m((p +-> s'(S_{n-1}, w, p)), u(m((p +-> s'(S_{n-2}, w, p)), ... u(m((p +-> s'(S_1, w, p)), r'(S -> S_0, w, p))))))) \end{verbatim} \end{frame} \begin{frame}[fragile]{Combinators} \begin{verbatim} r'(S -> S_0 ... S_{n-1}, w, p) = u(m((p +-> s'(S_{n-1}, w, p)), u(m((p +-> s'(S_{n-2}, w, p)), ... u(m((p +-> s'(S_1, w, p)), r'(S -> S_0, w, p))))))) = u(m((p +-> s'(S_{n-1}, w, p)), u(m((p +-> s'(S_{n-2}, w, p)), ... u(m((p +-> s'(S_1, w, p)), s'(S_0, w, p))))))) \end{verbatim} \end{frame} \begin{frame}[fragile]{Combinators} \begin{verbatim} r'(S -> S_0 ... S_{n-1}, w, p) = u(m((p +-> s'(S_{n-1}, w, p)), u(m((p +-> s'(S_{n-2}, w, p)), ... u(m((p +-> s'(S_1, w, p)), s'(S_0, w, p))))))) \end{verbatim} \end{frame} \begin{frame}[fragile]{Combinators} \begin{equation*} \{f(x)\} = u(\{\{f(x)\}\}) = u(m(f, \{x\})). \end{equation*} \begin{verbatim} {f(x)} = u({{f(x)}}) = u(m(f, {x})). r'(S -> S_0 ... S_{n-1}, w, p) = u(m((p +-> s'(S_{n-1}, w, p)), u(m((p +-> s'(S_{n-2}, w, p)), ... u(m((p +-> s'(S_1, w, p)), s'(S_0, w, p))))))) r'(S -> S_0 ... S_{n-1}, w, p) = u(m((p +-> s'(S_{n-1}, w, p)), u(m((p +-> s'(S_{n-2}, w, p)), ... u(m((p +-> s'(S_1, w, p)), u(m((p +-> s'(S_0, w, p)), {p})))))))). \end{verbatim} \end{frame} \begin{frame}[fragile]{Combinators} \begin{verbatim} r'(S -> S_0 ... S_{n-1}, w, p) = u(m((p +-> s'(S_{n-1}, w, p)), u(m((p +-> s'(S_{n-2}, w, p)), ... u(m((p +-> s'(S_1, w, p)), u(m((p +-> s'(S_0, w, p)), {p})))))))). \end{verbatim} \end{frame} \begin{frame}[fragile]{Combinators} \begin{equation*} z_i(x) := u(m((p \mapsto s'(S_i, w, p)), x)) \end{equation*} \begin{equation*} r'(S \rightarrow S_0 ... S_{n-1}, w, p) = (z_{n-1} \circ ... \circ z_0)({p}). \end{equation*} \end{frame} %% \begin{frame}[fragile]{Combinators} %% \begin{multline*} %% r'(S \rightarrow S_0 ... S_{n-1}, w, p) \\ %% % %% = \{p_n | p_1 \in s'(S_0, w, p), \\ %% p_2 \in s'(S_1, w, p_1), \\ %% ... \\ %% p_n \in s'(S_{n-1}, w, p_{n-1})\} %% % %% = %% \begin{aligned} %% \{p_n | & p_1 \in s'(S_0, w, p), \\ %% & p_2 \in s'(S_1, w, p_1), \\ %% & ... \\ %% & p_n \in s'(S_{n-1}, w, p_{n-1})\} %% \end{aligned} %% % %% = u(m((p \mapsto s'(S_{n-1}, w, p)), %% \{p_{n-1} | p_1 \in s'(S_0, w, p), %% p_2 \in s'(S_1, w, p_1) %% ... %% p_{n-1} \in s'(S_{n-2}, w, p_{n-2})\})) %% % %% = u(m((p \mapsto s'(S_{n-1}, w, p)), %% \{p_{n-1} | p_1 \in s'(S_0, w, p), %% p_2 \in s'(S_1, w, p_1) %% ... %% p_{n-1} \in s'(S_{n-2}, w, p_{n-2})\})) %% % %% = u(m((p \mapsto s'(S_{n-1}, w, p)), \\ %% u(m((p \mapsto s'(S_{n-2}, w, p)), \\ %% ... \\ %% u(m((p \mapsto s'(S_1, w, p)), \\ %% s'(S_0, w, p))))))) \\ %% \end{multline*} %% \end{frame} \begin{frame}[fragile]{Combinators} Rule: $S \rightarrow S_0 ... S_{n-1}$. \\ Recognizer: $r'' = s''_0 * ... * s''_{n-1}$. \\ Recognizer for symbols combines all respective rules: \\ $s = r''_0 + ... + r''_{m-1}.$ \end{frame} \begin{frame}[fragile]{Combinators} If mapping symbols to sets of symbols: \begin{eqnarray*} (s''_A * s''_B)(p) = u(m(s''_B, s''_A(p))) = u(m(s''_B, u(m(s''_A, \{p\})))). \\ (r''_1 + r''_2)(p) = u(\{r''_1(p), r''_2(p)\}) = r''_A(p) \cup r''_B(p). \end{eqnarray*} If mapping sets of symbols to sets of symbols: \begin{eqnarray*} (s''_A * s''_B)(P) = u(m(s''_B, u(m(s''_A, P)))). \\ (r''_1 + r''_2)(P) = u(\{r''_1(p), r''_2(p)\}) = m(r''_1, P) \cup m(r''_2, P). \end{eqnarray*} \end{frame} \begin{frame}[fragile]{Monads} $\{p\}$ is ``unit'', $m$ is ``map'', $u$ is ``union''. Haskell does it alternatively: \begin{equation*} (x >>= f) = u(m(f, x)) \end{equation*} \end{frame} \begin{frame}[fragile]{LL(1)} Predicting the rule using lookahead information. \pause LL(0) is either empty or single sentence. \pause LL(1) \end{frame} \begin{frame}[fragile]{LL(1)} \begin{equation*} s'(S, w, p) := r'(R, w, p), \quad {\rm where~} R = c(S, w[p]). \end{equation*} \begin{verbatim} S -> S_0 ... S_{n-1} ... S' -> ... S S{k+1} ... S_{m-1} \end{verbatim} \end{frame} \begin{frame}[fragile]{LL(1)} \begin{verbatim} S -> S_0 ... S_{n-1} ... S' -> ... S S{k+1} ... S_{m-1} \end{verbatim} %% \begin{equation*} %% w[p] \in {\rm FIRST}(S_0 ... S_{n-1}) \vee ({\rm NULLABLE}(S_0 ... S_{n-1}) \wedge w[p] \in {\rm FOLLOW}(S')) %% \end{equation*} \begin{equation*} w[p] \in {\rm FIRST}(S_0 ... S_{n-1}) \end{equation*} \begin{equation*} {\rm NULLABLE}(S_0 ... S_{n-1}) \wedge w[p] \in {\rm FOLLOW}(S') \end{equation*} \end{frame} \begin{frame}[fragile]{``Nullable''} \begin{eqnarray*} {\rm NULLABLE}(\_) \\ {\rm NULLABLE}(S_0 ... S_{n-1}) \Leftrightarrow \forall i {\rm NULLABLE}(S_i) \\ {\rm NULLABLE}(R) \Leftrightarrow {\rm NULLABLE}({\rm rhs}(R)) \\ {\rm NULLABLE}(S) \Leftrightarrow \exists R (R = S \rightarrow ...) \Rightarrow {\rm NULLABLE}(R) \end{eqnarray*} \end{frame} \begin{frame}[fragile]{``First''} \begin{eqnarray*} {\rm FIRST}(\_) = \{\} \\ {\rm FIRST}(S_0 ... S_{n-1}) \Leftrightarrow ... \\ {\rm FIRST}(S) \Leftrightarrow \bigcup {\rm FIRST}({\rm rhs}(R)) \\ \end{eqnarray*} \end{frame} \begin{frame}[fragile]{``Follow''} \begin{eqnarray*} {\rm FOLLOW}(S) = \bigcup_R ({\rm FIRST}(S_{k+1} ... S_{m-1}) \cup {\rm FOLLOW}({\rm lhs}(R)) \end{eqnarray*} \end{frame} \begin{frame}[fragile]{``Predict''} \begin{eqnarray*} {\rm PREDICT}(R) & = & {\rm FIRST}(R) \cup {\rm FOLLOW}({\rm lhs}(R)), \quad {\rm when~} {\rm NULLABLE}(R) \\ {\rm PREDICT}(R) & = & {\rm FIRST}(R), \quad {\rm otherwise} \end{eqnarray*} \end{frame} \begin{frame}[fragile]{LL(1)} At input string termination we choose ${\rm NULLABLE}$ rules. \end{frame} \end{document}