1 // Copyright (c) 2014- Facebook
2 // License: Boost License 1.0, http://boost.org/LICENSE_1_0.txt
3 // @author Andrei Alexandrescu (andrei.alexandrescu@facebook.com)
4 
5 module tooling.Tokenizer;
6 
7 import std.algorithm, std.array, std.ascii, std.conv, std.exception, std.regex,
8   std.stdio, std.typecons, std.typetuple;
9 
struct TokenizerGenerator(alias tokens, alias reservedTokens) {
  /**
   * All token types include regular and reservedTokens, plus the null
   * token ("") and the end-of-stream token ("\0").
   */
  private enum totalTokens = tokens.length + reservedTokens.length + 2;

  /**
   * Representation for token types: the smallest unsigned integer type
   * that can hold all totalTokens distinct ids.
   */
  static if (totalTokens <= ubyte.max)
    alias TokenIDRep = ubyte;
  else static if (totalTokens <= ushort.max)
    alias TokenIDRep = ushort;
  else // seriously?
    alias TokenIDRep = uint;

  /**
   * A token type, identified by a small integer. Id 0 is the null
   * ("unrecognized") token; the largest id is the end-of-input token.
   */
  struct TokenType2 {
    TokenIDRep id;

    /**
     * Returns the source symbol for this token type ("" for the null
     * token, "\0" for end of input).
     */
    string sym() const {
      // Valid ids are 0 .. tokens.length + reservedTokens.length + 1
      // (i.e. totalTokens - 1); the original bound was one too loose.
      assert(id <= tokens.length + reservedTokens.length + 1, text(id));
      if (id == 0) return "";
      auto i = id - 1;
      if (i >= tokens.length + reservedTokens.length) return "\0";
      return i < tokens.length
        ? tokens[i]
        : reservedTokens[i - tokens.length];
    }
  }

  /**
   * Compile-time lookup from symbol to its TokenType2, e.g. tk!"+".
   * Fails to compile for symbols not in tokens/reservedTokens.
   */
  template tk(string symbol) {
    import std.range;
    static if (symbol == "") {
      // Token ID 0 is reserved for "unrecognized token".
      enum tk = TokenType2(0);
    } else static if (symbol == "\0") {
      // Token ID max is reserved for "end of input".
        enum tk = TokenType2(
          cast(TokenIDRep) (1 + tokens.length + reservedTokens.length));
    } else {
        //enum id = chain(tokens, reservedTokens).countUntil(symbol);
      // Find the id within the regular tokens realm
      enum idTokens = tokens.countUntil(symbol);
      static if (idTokens >= 0) {
        // Found, regular token. Add 1 because 0 is reserved.
        enum id = idTokens + 1;
      } else {
        // not found, only chance is within the reserved tokens realm
        enum idResTokens = reservedTokens.countUntil(symbol);
        enum id = idResTokens >= 0 ? tokens.length + idResTokens + 1 : -1;
      }
      static assert(id >= 0 && id < TokenIDRep.max,
                    "Invalid token: " ~ symbol);
      // Fix: cast to TokenIDRep, not ubyte. When totalTokens > 255 the
      // representation is ushort/uint and a ubyte cast would silently
      // truncate ids (the "\0" branch above already casts correctly).
      enum tk = TokenType2(cast(TokenIDRep) id);
    }
  }

/**
 * Defines one token together with file and line information. The
 * precedingComment_ is set if there was one comment before the token.
 */
  struct Token {
    TokenType2 type_;
    string value_;                // literal text for identifiers/literals
    string precedingWhitespace_;  // whitespace (and comments) before the token
    size_t line_;
    string file_;

    /// Token text: the stored value if any, otherwise the type's symbol.
    string value() const {
      return value_ ? value_ : type_.sym;
    }
  }

  /**
   * Compile-time helper: generates the case labels (for a switch on
   * pc[index]) that match one of `tokens` starting at character position
   * `index`. Tokens sharing a character at `index` must be adjacent in
   * the array. *mayFallThrough is set when the generated code may fall
   * out of the switch without matching.
   */
  static string generateCases(string[] tokens, size_t index = 0,
                             bool* mayFallThrough = null) {
    assert(tokens.length > 1);

    static bool mustEscape(char c) {
      return c == '\\' || c == '"' || c == '\'';
    }

    static string escape(string s) {
      string result;
      foreach (c; s) {
        if (mustEscape(c)) result ~= "\\";
        result ~= c;
      }
      return result;
    }

    string result;
    for (size_t i = 0; i < tokens.length; ++i) {
      if (index >= tokens[i].length) {
        result ~= "default: t = tk!\""
          ~ tokens[i] ~ "\"; break token_search;\n";
      } else {
        result ~= "case '" ~ escape(tokens[i][index .. index + 1]) ~ "': ";
        auto j = i + 1;
        while (j < tokens.length && tokens[j][index] == tokens[i][index]) {
          ++j;
        }
        // i through j-1 are tokens with the same prefix
        if (j == i + 1) {
          if (tokens[i].length > index + 1) {
            result ~= "if (";
            result ~= tokens[i].length == index + 2
              ? ("pc["~to!string(index + 1)~"] == '"
                 ~ escape(tokens[i][index + 1 .. index + 2]) ~ "') ")
              : ("pc["~to!string(index + 1)~" .. $].startsWith(\""
                 ~ escape(tokens[i][index + 1 .. $]) ~ "\")) ");
            result ~= "{ t = tk!\""
              ~ escape(tokens[i]) ~
              "\"; break token_search; } else break;\n";
            if (mayFallThrough) *mayFallThrough = true;
          } else {
            result ~= "t = tk!\"" ~ escape(tokens[i])
              ~ "\"; break token_search;\n";
          }
          continue;
        }
        auto endOfToken = false;
        foreach (k; i .. j) {
          if (index + 1 >= tokens[k].length) {
            endOfToken = true;
            break;
          }
        }
        result ~= "switch (pc["~to!string(index + 1)~"]) {\n";
        if (!endOfToken) result ~= "default: break;\n";
        bool mft;
        result ~= generateCases(tokens[i .. j], index + 1, &mft)
          ~ "}";
        if (!endOfToken || mft) {
          result ~= " break;\n";
          if (mayFallThrough) *mayFallThrough = true;
        }
        else {
          result ~= "\n";
        }
        i = j - 1;
      }
    }
    return result;
  }

//pragma(msg, generateCases([NonAlphaTokens, Keywords]));

  /**
   * Skips leading whitespace in r and matches the token at the front.
   * Returns (newlines skipped, characters skipped, token type); the
   * type is tk!"" when nothing matched and tk!"\0" at end of input
   * (the '\0' sentinel).
   */
  static Tuple!(size_t, size_t, TokenType2)
  match(R)(R r) {
    size_t charsBefore = 0, linesBefore = 0;
    TokenType2 t;
    auto pc = r;

    token_search:
    for (;;) {
      assert(pc.length);
      switch (pc[0]) {
        case '\n':
          ++linesBefore;
          goto case; // fall through to also increment charsBefore
        case ' ': case '\t': case '\r':
          ++charsBefore;
          pc = pc[1 .. $];
          continue;
        case '\0':
          // done!
          t = tk!"\0";
          break token_search;
        default:
          break;
          // Bunch of cases generated here
          mixin(generateCases(tokens));
      }
      // Couldn't match any token
      t = tk!"";
      break;
    }
    return tuple(linesBefore, charsBefore, t);
  }
}
191 
/**
 * All non-alphabetic tokens recognized by the lexer: punctuation,
 * operators, the leading digits (numbers are completed later by
 * munchNumber), and a few extensions ('@', '$', raw-string opener).
 * NOTE: generateCases groups *consecutive* entries that share a prefix
 * character, so tokens with a common first character must stay adjacent
 * in this list.
 */
alias NonAlphaTokens = TypeTuple!(
  "~", "(", ")", "[", "]", "{", "}", ";", ",", "?",
  "<", "<=", "<<", "<<=", ">", ">=", ">>", ">>=", "%", "%=", "=", "==",
  "!", "!=", "^", "^=", "*", "*=",
  ":", "::", "+", "++", "+=", "&", "&&", "&=", "|", "||", "|=",
  "-", "--", "-=", "->", "->*",
  "/", "/=", "//", "/*",
  "\\",
  ".", ".*", "...",
  "'",
  "\"",
  "#", "##",
  "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
  "@", "$", // allow as extensions
  "R\"(",
);
208 
/**
 * C++ keywords, including the alternative operator spellings ("and",
 * "bitor", ...). Listed in alphabetical groups; matched after the
 * non-alpha tokens, with nextToken promoting a keyword that is merely
 * the prefix of a longer symbol to an identifier.
 */
alias Keywords = TypeTuple!(
  "and", "and_eq", "asm", "auto",
  "bitand", "bitor", "bool", "break",
  "case", "catch", "char", "class", "compl", "const", "const_cast", "constexpr",
    "continue",
  "default", "delete", "do", "double", "dynamic_cast",
  "else", "enum", "explicit", "extern", "false", "float", "for", "friend",
  "goto",
  "if", "inline", "int",
  "long",
  "mutable",
  "namespace", "new", "noexcept", "not", "not_eq",
  "operator", "or", "or_eq",
  "private", "protected", "public",
  "register", "reinterpret_cast", "return",
  "short", "signed", "sizeof", "static", "static_cast", "struct", "switch",
  "template", "this", "throw", "true", "try", "typedef", "typeid", "typename",
  "union", "unsigned", "using",
  "virtual", "void", "volatile",
  "wchar_t", "while",
  "xor", "xor_eq",
);
231 
/**
 * Token types recognized by hand-written code in nextToken (identifier,
 * literal, and preprocessor handling) rather than by the generated
 * matcher; passed to TokenizerGenerator as its reserved tokens.
 */
static immutable string[] specialTokens = [
  "identifier", "number", "string_literal", "char_literal",
  "preprocessor_directive"
];
236 
/**
 * The concrete C++ lexer: regular tokens plus the special reserved
 * ones, with convenience aliases so client code can write tk!"+",
 * Token, and TokenType directly.
 */
alias TokenizerGenerator!([NonAlphaTokens, Keywords], specialTokens)
  CppLexer;
alias tk = CppLexer.tk;
alias Token = CppLexer.Token;
alias TokenType = CppLexer.TokenType2;
242 
243 /**
244  * This is the quintessential function. Given a string containing C++
245  * code and a filename, fills output with the tokens in the
246  * file. Warning - don't use temporaries for input and filename
247  * because the resulting tokens contain StringPiece objects pointing
248  * into them.
249  */
250 CppLexer.Token[] tokenize(string input, string initialFilename = null) {
251   input ~= '\0'; // TODO revisit this
252   CppLexer.Token[] output;
253   auto file = initialFilename;
254   size_t line = 1;
255 
256   for (;;) {
257     auto t = nextToken(input, line);
258     t.file_ = initialFilename;
259     //writeln(t);
260     output ~= t;
261     if (t.type_ is CppLexer.tk!"\0") break;
262   }
263 
264   return output;
265 }
266 
/**
 * Convenience overload: stores the token array in the caller-provided
 * t instead of returning it.
 */
void tokenize(string input, string initialFilename, ref CppLexer.Token[] t) {
  t = tokenize(input, initialFilename);
}
270 
271 /**
272  * Helper function, gets next token and updates pc and line.
273  */
274 CppLexer.Token nextToken(ref string pc, ref size_t line) {
275   size_t charsBefore;
276   string value;
277   CppLexer.TokenType2 tt;
278   auto initialPc = pc;
279   auto initialLine = line;
280   size_t tokenLine;
281 
282   for (;;) {
283     auto t = CppLexer.match(pc);
284     line += t[0];
285     tokenLine = line;
286     charsBefore += t[1];
287     tt = t[2];
288 
289     // Position pc to advance past the leading whitespace
290     pc = pc[t[1] .. $];
291 
292     // Unrecognized token?
293     if (tt is tk!"") {
294       auto c = pc[0];
295       if (std.ascii.isAlpha(c) || c == '_' || c == '$' || c == '@') {
296         value = munchIdentifier(pc);
297         //writeln("sym: ", value);
298         tt = tk!"identifier";
299         break;
300       } else {
301         writeln("Illegal character: ", cast(uint) c, " [", c, "]");
302         throw new Exception("Illegal character");
303       }
304     }
305 
306     assert(tt.sym.length >= 1, "'" ~ tt.sym ~ "'");
307     // Number?
308     if (isDigit(tt.sym[0]) || tt is tk!"." && isDigit(pc[1])) {
309       tt = tk!"number";
310       value = munchNumber(pc);
311       break;
312     }
313 
314     // Single-line comment?
315     if (tt is tk!"//") {
316       charsBefore += munchSingleLineComment(pc, line).length;
317       continue;
318     }
319 
320     // Multi-line comment?
321     if (tt is tk!"/*") {
322       charsBefore += munchComment(pc, line).length;
323       continue;
324     }
325 
326     // #pragma/#error/#warning preprocessor directive (except #pragma once)?
327     if (tt == tk!"#" && match(pc, ctRegex!(`^#\s*(error|warning|pragma)\s`))
328           && !match(pc, ctRegex!(`^#\s*pragma\s+once`))) {
329       value = munchPreprocessorDirective(pc, line);
330       tt = tk!"preprocessor_directive";
331       break;
332     }
333 
334     // Literal string?
335     if (tt is tk!"\"") {
336       value = munchString(pc, line);
337       tt = tk!"string_literal";
338       break;
339     }
340 
341     //Raw string?
342     if (tt is tk!"R\"(") {
343       value = munchRawString(pc, line);
344       tt = tk!"string_literal";
345       break;
346     }
347 
348     // Literal char?
349     if (tt is tk!"'") {
350       value = munchCharLiteral(pc, line);
351       tt = tk!"char_literal";
352       break;
353     }
354 
355     // Keyword or identifier?
356     char c = tt.sym[0];
357     if (std.ascii.isAlpha(c) || c == '_' || c == '$' || c == '@') {
358       // This is a keyword, but it may be a prefix of a longer symbol
359       assert(pc.length >= tt.sym.length, text(tt.sym, ": ", pc));
360       c = pc[tt.sym.length];
361       if (isAlphaNum(c) || c == '_' || c == '$' || c == '@') {
362         // yep, longer symbol
363         value = munchIdentifier(pc);
364         // writeln("sym: ", tt.symbol, symbol);
365         tt = tk!"identifier";
366       } else {
367         // Keyword, just update the pc
368         pc = pc[tt.sym.length .. $];
369       }
370       break;
371     }
372 
373     // End of stream?
374     if (tt is tk!"\0") {
375       break;
376     }
377 
378     // We have something!
379     pc = pc[tt.sym.length .. $];
380     break;
381   }
382 
383   version (unittest) {
384     // make sure the we munched the right number of characters
385     auto delta = initialPc.length - pc.length;
386     if (tt is tk!"\0") delta += 1;
387     auto tsz = charsBefore + (value ? value.length : tt.sym().length);
388     if (tsz != delta) {
389       stderr.writeln("Flint tokenization error: Wrong size for token type '",
390           tt.sym(), "': '", initialPc[0 .. charsBefore], "'~'", value, "' ",
391           "of size ", tsz, " != '", initialPc[0 .. delta], "' of size ",
392           delta);
393       throw new Exception("Internal flint error");
394     }
395 
396     // make sure that line was incremented the correct number of times
397     auto lskip = std.algorithm.count(initialPc[0 .. delta], '\n');
398     if (initialLine + lskip != line) {
399       stderr.writeln("Flint tokenization error: muched '",
400           initialPc[0 .. delta], "' (token type '", tt.sym(), "'), "
401           "which contains ", lskip, " newlines, "
402           "but line has been incremented by ", line - initialLine);
403       throw new Exception("Internal flint error");
404     }
405   }
406 
407   return CppLexer.Token(
408     tt, value,
409     initialPc[0 .. charsBefore],
410     tokenLine);
411 }
412 
413 /**
414  * Eats howMany characters out of pc, avances pc appropriately, and
415  * returns the eaten portion.
416  */
417 static string munchChars(ref string pc, size_t howMany) {
418   assert(pc.length >= howMany);
419   auto result = pc[0 .. howMany];
420   pc = pc[howMany .. $];
421   return result;
422 }
423 
424 /**
425  * Assuming pc is positioned at the start of a single-line comment,
426  * munches it from pc and returns it.
427  */
428 static string munchSingleLineComment(ref string pc, ref size_t line) {
429   for (size_t i = 0; ; ++i) {
430     assert(i < pc.length);
431     auto c = pc[i];
432     if (c == '\n') {
433       ++line;
434       if (i > 0 && pc[i - 1] == '\\') {
435         // multiline single-line comment (sic)
436         continue;
437       }
438       // end of comment
439       return munchChars(pc, i + 1);
440     }
441     if (!c) {
442       // single-line comment at end of file, meh
443       return munchChars(pc, i);
444     }
445   }
446   assert(false);
447 }
448 
449 /**
450  * Assuming pc is positioned at the start of a C-style comment,
451  * munches it from pc and returns it.
452  */
453 static string munchComment(ref string pc, ref size_t line) {
454   //assert(pc[0] == '/' && pc[1] == '*');
455   for (size_t i = 0; ; ++i) {
456     assert(i < pc.length);
457     auto c = pc[i];
458     if (c == '\n') {
459       ++line;
460     }
461     else if (c == '*') {
462       if (pc[i + 1] == '/') {
463         // end of comment
464         return munchChars(pc, i + 2);
465       }
466     }
467     else if (!c) {
468       // end of input
469       enforce(false, "Unterminated comment: ", pc);
470     }
471   }
472   assert(false);
473 }
474 
475 /**
476  * Assuming pc is positioned at the start of a specified preprocessor directive,
477  * munches it from pc and returns it.
478  */
479 static string munchPreprocessorDirective(ref string pc, ref size_t line) {
480   for (size_t i = 0; ; ++i) {
481     assert(i < pc.length);
482     auto c = pc[i];
483     if (c == '\n') {
484       if (i > 0 && pc[i - 1] == '\\') {
485         // multiline directive
486         ++line;
487         continue;
488       }
489       // end of directive
490       return munchChars(pc, i);
491     }
492     if (!c) {
493       // directive at end of file
494       return munchChars(pc, i);
495     }
496   }
497 }
498 
499 /**
500  * Assuming pc is positioned at the start of an identifier, munches it
501  * from pc and returns it.
502  */
503 static string munchIdentifier(ref string pc) {
504   for (size_t i = 0; ; ++i) {
505     assert(i < pc.length);
506     const c = pc[i];
507     // g++ allows '$' in identifiers. Also, some crazy inline
508     // assembler uses '@' in identifiers, see e.g.
509     // fbcode/external/cryptopp/rijndael.cpp, line 527
510     if (!isAlphaNum(c) && c != '_' && c != '$' && c != '@') {
511       // done
512       enforce(i > 0, "Invalid identifier: ", pc);
513       return munchChars(pc, i);
514     }
515   }
516   assert(false);
517 }
518 
519 /**
520  * Assuming pc is positioned at the start of a string literal, munches
521  * it from pc and returns it. A reference to line is passed in order
522  * to track multiline strings correctly.
523  */
524 static string munchString(ref string pc, ref size_t line) {
525   assert(pc[0] == '"');
526   for (size_t i = 1; ; ++i) {
527     const c = pc[i];
528     if (c == '"') {
529       // That's about it
530       return munchChars(pc, i + 1);
531     }
532     if (c == '\\') {
533       ++i;
534       if (pc[i] == '\n') {
535         ++line;
536       }
537       continue;
538     }
539     enforce(c, "Unterminated string constant: ", pc);
540   }
541 }
542 
543 /**
544   * Assuming pc is positioned at the start og a raw string, munches
545   * it from pc and returns it.
546   */
547 static string munchRawString(ref string pc, ref size_t line) {
548   assert(pc.startsWith(`R"(`));
549   for (size_t i = 3; ; ++i) {
550     const c = pc[i];
551     if (c == ')') {
552       if (pc[i + 1] == '"') {
553         //End of a raw string literal
554         return munchChars(pc, i + 2);
555       }
556     }
557     enforce(c, "Unterminated raw string: ", pc);
558   }
559 }
560 
561 /**
562  * Assuming pc is positioned at the start of a number (be it decimal
563  * or floating-point), munches it off pc and returns it. Note that the
564  * number is assumed to be correct so a number of checks are not
565  * necessary.
566  */
567 static string munchNumber(ref string pc) {
568   bool sawDot = false, sawExp = false, sawX = false, sawSuffix = false;
569   for (size_t i = 0; ; ++i) {
570     assert(i < pc.length);
571     auto const c = pc[i];
572     if (c == '.' && !sawDot && !sawExp && !sawSuffix) {
573       sawDot = true;
574     } else if (isDigit(c)) {
575       // Nothing to do
576     } else if (sawX && !sawExp && c && "AaBbCcDdEeFf".canFind(c)) {
577       // Hex digit; nothing to do. The condition includes !sawExp
578       // because the exponent is decimal even in a hex floating-point
579       // number!
580     } else if (c == '+' || c == '-') {
581       // Sign may appear at the start or right after E or P
582       if (i > 0 && !"EePp".canFind(pc[i - 1])) {
583         // Done, the sign is the next token
584         return munchChars(pc, i);
585       }
586     } else if (!sawExp && !sawSuffix && !sawX && (c == 'e' || c == 'E')) {
587       sawExp = true;
588     } else if (sawX && !sawExp && !sawSuffix && (c == 'p' || c == 'P')) {
589       sawExp = true;
590     } else if ((c == 'x' || c == 'X') && i == 1 && pc[0] == '0') {
591       sawX = true;
592     } else if (c && "FfLlUu".canFind(c)) {
593       // It's a suffix. There could be several of them (including
594       // repeats a la LL), so let's not return just yet
595       sawSuffix = true;
596     } else {
597       // done
598       enforce(i > 0, "Invalid number: ", pc);
599       return munchChars(pc, i);
600     }
601   }
602   assert(false);
603 }
604 
605 /**
606  * Assuming pc is positioned at the start of a character literal,
607  * munches it from pc and returns it. A reference to line is passed in
608  * order to track multiline character literals (yeah, that can
609  * actually happen) correctly.
610  */
611 static string munchCharLiteral(ref string pc, ref size_t line) {
612   assert(pc[0] == '\'');
613   for (size_t i = 1; ; ++i) {
614     auto const c = pc[i];
615     if (c == '\n') {
616       ++line;
617     }
618     if (c == '\'') {
619       // That's about it
620       return munchChars(pc, i + 1);
621     }
622     if (c == '\\') {
623       ++i;
624       continue;
625     }
626     enforce(c, "Unterminated character constant: ", pc);
627   }
628 }