1 // Copyright (c) 2014- Facebook 2 // License: Boost License 1.0, http://boost.org/LICENSE_1_0.txt 3 // @author Andrei Alexandrescu (andrei.alexandrescu@facebook.com) 4 5 module tooling.Tokenizer; 6 7 import std.algorithm, std.array, std.ascii, std.conv, std.exception, std.regex, 8 std.stdio, std.typecons, std.typetuple; 9 10 struct TokenizerGenerator(alias tokens, alias reservedTokens) { 11 /** 12 * All token types include regular and reservedTokens, plus the null 13 * token ("") and the end-of-stream token ("\0"). 14 */ 15 private enum totalTokens = tokens.length + reservedTokens.length + 2; 16 17 /** 18 * Representation for token types. 19 */ 20 static if (totalTokens <= ubyte.max) 21 alias TokenIDRep = ubyte; 22 else static if (totalTokens <= ushort.max) 23 alias TokenIDRep = ushort; 24 else // seriously? 25 alias TokenIDRep = uint; 26 27 struct TokenType2 { 28 TokenIDRep id; 29 30 string sym() const { 31 assert(id <= tokens.length + reservedTokens.length + 2, text(id)); 32 if (id == 0) return ""; 33 auto i = id - 1; 34 if (i >= tokens.length + reservedTokens.length) return "\0"; 35 return i < tokens.length 36 ? tokens[i] 37 : reservedTokens[i - tokens.length]; 38 } 39 } 40 41 template tk(string symbol) { 42 import std.range; 43 static if (symbol == "") { 44 // Token ID 0 is reserved for "unrecognized token". 45 enum tk = TokenType2(0); 46 } else static if (symbol == "\0") { 47 // Token ID max is reserved for "end of input". 48 enum tk = TokenType2( 49 cast(TokenIDRep) (1 + tokens.length + reservedTokens.length)); 50 } else { 51 //enum id = chain(tokens, reservedTokens).countUntil(symbol); 52 // Find the id within the regular tokens realm 53 enum idTokens = tokens.countUntil(symbol); 54 static if (idTokens >= 0) { 55 // Found, regular token. Add 1 because 0 is reserved. 56 enum id = idTokens + 1; 57 } else { 58 // not found, only chance is within the reserved tokens realm 59 enum idResTokens = reservedTokens.countUntil(symbol); 60 enum id = idResTokens >= 0 ? tokens.length + idResTokens + 1 : -1; 61 } 62 static assert(id >= 0 && id < TokenIDRep.max, 63 "Invalid token: " ~ symbol); 64 enum tk = TokenType2(cast(ubyte) id); 65 } 66 } 67 68 /** 69 * Defines one token together with file and line information. The 70 * precedingComment_ is set if there was one comment before the token. 71 */ 72 struct Token { 73 TokenType2 type_; 74 string value_; 75 string precedingWhitespace_; 76 size_t line_; 77 string file_; 78 79 string value() const { 80 return value_ ? value_ : type_.sym; 81 } 82 }; 83 84 static string generateCases(string[] tokens, size_t index = 0, 85 bool* mayFallThrough = null) { 86 assert(tokens.length > 1); 87 88 static bool mustEscape(char c) { 89 return c == '\\' || c == '"' || c == '\''; 90 } 91 92 static string escape(string s) { 93 string result; 94 foreach (c; s) { 95 if (mustEscape(c)) result ~= "\\"; 96 result ~= c; 97 } 98 return result; 99 } 100 101 string result; 102 for (size_t i = 0; i < tokens.length; ++i) { 103 if (index >= tokens[i].length) { 104 result ~= "default: t = tk!\"" 105 ~ tokens[i] ~ "\"; break token_search;\n"; 106 } else { 107 result ~= "case '" ~ escape(tokens[i][index .. index + 1]) ~ "': "; 108 auto j = i + 1; 109 while (j < tokens.length && tokens[j][index] == tokens[i][index]) { 110 ++j; 111 } 112 // i through j-1 are tokens with the same prefix 113 if (j == i + 1) { 114 if (tokens[i].length > index + 1) { 115 result ~= "if ("; 116 result ~= tokens[i].length == index + 2 117 ? ("pc["~to!string(index + 1)~"] == '" 118 ~ escape(tokens[i][index + 1 .. index + 2]) ~ "') ") 119 : ("pc["~to!string(index + 1)~" .. $].startsWith(\"" 120 ~ escape(tokens[i][index + 1 .. $]) ~ "\")) "); 121 result ~= "{ t = tk!\"" 122 ~ escape(tokens[i]) ~ 123 "\"; break token_search; } else break;\n"; 124 if (mayFallThrough) *mayFallThrough = true; 125 } else { 126 result ~= "t = tk!\"" ~ escape(tokens[i]) 127 ~ "\"; break token_search;\n"; 128 } 129 continue; 130 } 131 auto endOfToken = false; 132 foreach (k; i .. j) { 133 if (index + 1 >= tokens[k].length) { 134 endOfToken = true; 135 break; 136 } 137 } 138 result ~= "switch (pc["~to!string(index + 1)~"]) {\n"; 139 if (!endOfToken) result ~= "default: break;\n"; 140 bool mft; 141 result ~= generateCases(tokens[i .. j], index + 1, &mft) 142 ~ "}"; 143 if (!endOfToken || mft) { 144 result ~= " break;\n"; 145 if (mayFallThrough) *mayFallThrough = true; 146 } 147 else { 148 result ~= "\n"; 149 } 150 i = j - 1; 151 } 152 } 153 return result; 154 } 155 156 //pragma(msg, generateCases([NonAlphaTokens, Keywords])); 157 158 static Tuple!(size_t, size_t, TokenType2) 159 match(R)(R r) { 160 size_t charsBefore = 0, linesBefore = 0; 161 TokenType2 t; 162 auto pc = r; 163 164 token_search: 165 for (;;) { 166 assert(pc.length); 167 switch (pc[0]) { 168 case '\n': 169 ++linesBefore; 170 goto case; // fall through to also increment charsBefore 171 case ' ': case '\t': case '\r': 172 ++charsBefore; 173 pc = pc[1 .. $]; 174 continue; 175 case '\0': 176 // done! 177 t = tk!"\0"; 178 break token_search; 179 default: 180 break; 181 // Bunch of cases generated here 182 mixin(generateCases(tokens)); 183 } 184 // Couldn't match any token 185 t = tk!""; 186 break; 187 } 188 return tuple(linesBefore, charsBefore, t); 189 } 190 } 191 192 alias NonAlphaTokens = TypeTuple!( 193 "~", "(", ")", "[", "]", "{", "}", ";", ",", "?", 194 "<", "<=", "<<", "<<=", ">", ">=", ">>", ">>=", "%", "%=", "=", "==", 195 "!", "!=", "^", "^=", "*", "*=", 196 ":", "::", "+", "++", "+=", "&", "&&", "&=", "|", "||", "|=", 197 "-", "--", "-=", "->", "->*", 198 "/", "/=", "//", "/*", 199 "\\", 200 ".", ".*", "...", 201 "'", 202 "\"", 203 "#", "##", 204 "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", 205 "@", "$", // allow as extensions 206 "R\"(", 207 ); 208 209 alias Keywords = TypeTuple!( 210 "and", "and_eq", "asm", "auto", 211 "bitand", "bitor", "bool", "break", 212 "case", "catch", "char", "class", "compl", "const", "const_cast", "constexpr", 213 "continue", 214 "default", "delete", "do", "double", "dynamic_cast", 215 "else", "enum", "explicit", "extern", "false", "float", "for", "friend", 216 "goto", 217 "if", "inline", "int", 218 "long", 219 "mutable", 220 "namespace", "new", "noexcept", "not", "not_eq", 221 "operator", "or", "or_eq", 222 "private", "protected", "public", 223 "register", "reinterpret_cast", "return", 224 "short", "signed", "sizeof", "static", "static_cast", "struct", "switch", 225 "template", "this", "throw", "true", "try", "typedef", "typeid", "typename", 226 "union", "unsigned", "using", 227 "virtual", "void", "volatile", 228 "wchar_t", "while", 229 "xor", "xor_eq", 230 ); 231 232 static immutable string[] specialTokens = [ 233 "identifier", "number", "string_literal", "char_literal", 234 "preprocessor_directive" 235 ]; 236 237 alias TokenizerGenerator!([NonAlphaTokens, Keywords], specialTokens) 238 CppLexer; 239 alias tk = CppLexer.tk; 240 alias Token = CppLexer.Token; 241 alias TokenType = CppLexer.TokenType2; 242 243 /** 244 * This is the quintessential function. Given a string containing C++ 245 * code and a filename, fills output with the tokens in the 246 * file. Warning - don't use temporaries for input and filename 247 * because the resulting tokens contain StringPiece objects pointing 248 * into them. 249 */ 250 CppLexer.Token[] tokenize(string input, string initialFilename = null) { 251 input ~= '\0'; // TODO revisit this 252 CppLexer.Token[] output; 253 auto file = initialFilename; 254 size_t line = 1; 255 256 for (;;) { 257 auto t = nextToken(input, line); 258 t.file_ = initialFilename; 259 //writeln(t); 260 output ~= t; 261 if (t.type_ is CppLexer.tk!"\0") break; 262 } 263 264 return output; 265 } 266 267 void tokenize(string input, string initialFilename, ref CppLexer.Token[] t) { 268 t = tokenize(input, initialFilename); 269 } 270 271 /** 272 * Helper function, gets next token and updates pc and line. 273 */ 274 CppLexer.Token nextToken(ref string pc, ref size_t line) { 275 size_t charsBefore; 276 string value; 277 CppLexer.TokenType2 tt; 278 auto initialPc = pc; 279 auto initialLine = line; 280 size_t tokenLine; 281 282 for (;;) { 283 auto t = CppLexer.match(pc); 284 line += t[0]; 285 tokenLine = line; 286 charsBefore += t[1]; 287 tt = t[2]; 288 289 // Position pc to advance past the leading whitespace 290 pc = pc[t[1] .. $]; 291 292 // Unrecognized token? 293 if (tt is tk!"") { 294 auto c = pc[0]; 295 if (std.ascii.isAlpha(c) || c == '_' || c == '$' || c == '@') { 296 value = munchIdentifier(pc); 297 //writeln("sym: ", value); 298 tt = tk!"identifier"; 299 break; 300 } else { 301 writeln("Illegal character: ", cast(uint) c, " [", c, "]"); 302 throw new Exception("Illegal character"); 303 } 304 } 305 306 assert(tt.sym.length >= 1, "'" ~ tt.sym ~ "'"); 307 // Number? 308 if (isDigit(tt.sym[0]) || tt is tk!"." && isDigit(pc[1])) { 309 tt = tk!"number"; 310 value = munchNumber(pc); 311 break; 312 } 313 314 // Single-line comment? 315 if (tt is tk!"//") { 316 charsBefore += munchSingleLineComment(pc, line).length; 317 continue; 318 } 319 320 // Multi-line comment? 321 if (tt is tk!"/*") { 322 charsBefore += munchComment(pc, line).length; 323 continue; 324 } 325 326 // #pragma/#error/#warning preprocessor directive (except #pragma once)? 327 if (tt == tk!"#" && match(pc, ctRegex!(`^#\s*(error|warning|pragma)\s`)) 328 && !match(pc, ctRegex!(`^#\s*pragma\s+once`))) { 329 value = munchPreprocessorDirective(pc, line); 330 tt = tk!"preprocessor_directive"; 331 break; 332 } 333 334 // Literal string? 335 if (tt is tk!"\"") { 336 value = munchString(pc, line); 337 tt = tk!"string_literal"; 338 break; 339 } 340 341 //Raw string? 342 if (tt is tk!"R\"(") { 343 value = munchRawString(pc, line); 344 tt = tk!"string_literal"; 345 break; 346 } 347 348 // Literal char? 349 if (tt is tk!"'") { 350 value = munchCharLiteral(pc, line); 351 tt = tk!"char_literal"; 352 break; 353 } 354 355 // Keyword or identifier? 356 char c = tt.sym[0]; 357 if (std.ascii.isAlpha(c) || c == '_' || c == '$' || c == '@') { 358 // This is a keyword, but it may be a prefix of a longer symbol 359 assert(pc.length >= tt.sym.length, text(tt.sym, ": ", pc)); 360 c = pc[tt.sym.length]; 361 if (isAlphaNum(c) || c == '_' || c == '$' || c == '@') { 362 // yep, longer symbol 363 value = munchIdentifier(pc); 364 // writeln("sym: ", tt.symbol, symbol); 365 tt = tk!"identifier"; 366 } else { 367 // Keyword, just update the pc 368 pc = pc[tt.sym.length .. $]; 369 } 370 break; 371 } 372 373 // End of stream? 374 if (tt is tk!"\0") { 375 break; 376 } 377 378 // We have something! 379 pc = pc[tt.sym.length .. $]; 380 break; 381 } 382 383 version (unittest) { 384 // make sure the we munched the right number of characters 385 auto delta = initialPc.length - pc.length; 386 if (tt is tk!"\0") delta += 1; 387 auto tsz = charsBefore + (value ? value.length : tt.sym().length); 388 if (tsz != delta) { 389 stderr.writeln("Flint tokenization error: Wrong size for token type '", 390 tt.sym(), "': '", initialPc[0 .. charsBefore], "'~'", value, "' ", 391 "of size ", tsz, " != '", initialPc[0 .. delta], "' of size ", 392 delta); 393 throw new Exception("Internal flint error"); 394 } 395 396 // make sure that line was incremented the correct number of times 397 auto lskip = std.algorithm.count(initialPc[0 .. delta], '\n'); 398 if (initialLine + lskip != line) { 399 stderr.writeln("Flint tokenization error: muched '", 400 initialPc[0 .. delta], "' (token type '", tt.sym(), "'), " 401 "which contains ", lskip, " newlines, " 402 "but line has been incremented by ", line - initialLine); 403 throw new Exception("Internal flint error"); 404 } 405 } 406 407 return CppLexer.Token( 408 tt, value, 409 initialPc[0 .. charsBefore], 410 tokenLine); 411 } 412 413 /** 414 * Eats howMany characters out of pc, avances pc appropriately, and 415 * returns the eaten portion. 416 */ 417 static string munchChars(ref string pc, size_t howMany) { 418 assert(pc.length >= howMany); 419 auto result = pc[0 .. howMany]; 420 pc = pc[howMany .. $]; 421 return result; 422 } 423 424 /** 425 * Assuming pc is positioned at the start of a single-line comment, 426 * munches it from pc and returns it. 427 */ 428 static string munchSingleLineComment(ref string pc, ref size_t line) { 429 for (size_t i = 0; ; ++i) { 430 assert(i < pc.length); 431 auto c = pc[i]; 432 if (c == '\n') { 433 ++line; 434 if (i > 0 && pc[i - 1] == '\\') { 435 // multiline single-line comment (sic) 436 continue; 437 } 438 // end of comment 439 return munchChars(pc, i + 1); 440 } 441 if (!c) { 442 // single-line comment at end of file, meh 443 return munchChars(pc, i); 444 } 445 } 446 assert(false); 447 } 448 449 /** 450 * Assuming pc is positioned at the start of a C-style comment, 451 * munches it from pc and returns it. 452 */ 453 static string munchComment(ref string pc, ref size_t line) { 454 //assert(pc[0] == '/' && pc[1] == '*'); 455 for (size_t i = 0; ; ++i) { 456 assert(i < pc.length); 457 auto c = pc[i]; 458 if (c == '\n') { 459 ++line; 460 } 461 else if (c == '*') { 462 if (pc[i + 1] == '/') { 463 // end of comment 464 return munchChars(pc, i + 2); 465 } 466 } 467 else if (!c) { 468 // end of input 469 enforce(false, "Unterminated comment: ", pc); 470 } 471 } 472 assert(false); 473 } 474 475 /** 476 * Assuming pc is positioned at the start of a specified preprocessor directive, 477 * munches it from pc and returns it. 478 */ 479 static string munchPreprocessorDirective(ref string pc, ref size_t line) { 480 for (size_t i = 0; ; ++i) { 481 assert(i < pc.length); 482 auto c = pc[i]; 483 if (c == '\n') { 484 if (i > 0 && pc[i - 1] == '\\') { 485 // multiline directive 486 ++line; 487 continue; 488 } 489 // end of directive 490 return munchChars(pc, i); 491 } 492 if (!c) { 493 // directive at end of file 494 return munchChars(pc, i); 495 } 496 } 497 } 498 499 /** 500 * Assuming pc is positioned at the start of an identifier, munches it 501 * from pc and returns it. 502 */ 503 static string munchIdentifier(ref string pc) { 504 for (size_t i = 0; ; ++i) { 505 assert(i < pc.length); 506 const c = pc[i]; 507 // g++ allows '$' in identifiers. Also, some crazy inline 508 // assembler uses '@' in identifiers, see e.g. 509 // fbcode/external/cryptopp/rijndael.cpp, line 527 510 if (!isAlphaNum(c) && c != '_' && c != '$' && c != '@') { 511 // done 512 enforce(i > 0, "Invalid identifier: ", pc); 513 return munchChars(pc, i); 514 } 515 } 516 assert(false); 517 } 518 519 /** 520 * Assuming pc is positioned at the start of a string literal, munches 521 * it from pc and returns it. A reference to line is passed in order 522 * to track multiline strings correctly. 523 */ 524 static string munchString(ref string pc, ref size_t line) { 525 assert(pc[0] == '"'); 526 for (size_t i = 1; ; ++i) { 527 const c = pc[i]; 528 if (c == '"') { 529 // That's about it 530 return munchChars(pc, i + 1); 531 } 532 if (c == '\\') { 533 ++i; 534 if (pc[i] == '\n') { 535 ++line; 536 } 537 continue; 538 } 539 enforce(c, "Unterminated string constant: ", pc); 540 } 541 } 542 543 /** 544 * Assuming pc is positioned at the start og a raw string, munches 545 * it from pc and returns it. 546 */ 547 static string munchRawString(ref string pc, ref size_t line) { 548 assert(pc.startsWith(`R"(`)); 549 for (size_t i = 3; ; ++i) { 550 const c = pc[i]; 551 if (c == ')') { 552 if (pc[i + 1] == '"') { 553 //End of a raw string literal 554 return munchChars(pc, i + 2); 555 } 556 } 557 enforce(c, "Unterminated raw string: ", pc); 558 } 559 } 560 561 /** 562 * Assuming pc is positioned at the start of a number (be it decimal 563 * or floating-point), munches it off pc and returns it. Note that the 564 * number is assumed to be correct so a number of checks are not 565 * necessary. 566 */ 567 static string munchNumber(ref string pc) { 568 bool sawDot = false, sawExp = false, sawX = false, sawSuffix = false; 569 for (size_t i = 0; ; ++i) { 570 assert(i < pc.length); 571 auto const c = pc[i]; 572 if (c == '.' && !sawDot && !sawExp && !sawSuffix) { 573 sawDot = true; 574 } else if (isDigit(c)) { 575 // Nothing to do 576 } else if (sawX && !sawExp && c && "AaBbCcDdEeFf".canFind(c)) { 577 // Hex digit; nothing to do. The condition includes !sawExp 578 // because the exponent is decimal even in a hex floating-point 579 // number! 580 } else if (c == '+' || c == '-') { 581 // Sign may appear at the start or right after E or P 582 if (i > 0 && !"EePp".canFind(pc[i - 1])) { 583 // Done, the sign is the next token 584 return munchChars(pc, i); 585 } 586 } else if (!sawExp && !sawSuffix && !sawX && (c == 'e' || c == 'E')) { 587 sawExp = true; 588 } else if (sawX && !sawExp && !sawSuffix && (c == 'p' || c == 'P')) { 589 sawExp = true; 590 } else if ((c == 'x' || c == 'X') && i == 1 && pc[0] == '0') { 591 sawX = true; 592 } else if (c && "FfLlUu".canFind(c)) { 593 // It's a suffix. There could be several of them (including 594 // repeats a la LL), so let's not return just yet 595 sawSuffix = true; 596 } else { 597 // done 598 enforce(i > 0, "Invalid number: ", pc); 599 return munchChars(pc, i); 600 } 601 } 602 assert(false); 603 } 604 605 /** 606 * Assuming pc is positioned at the start of a character literal, 607 * munches it from pc and returns it. A reference to line is passed in 608 * order to track multiline character literals (yeah, that can 609 * actually happen) correctly. 610 */ 611 static string munchCharLiteral(ref string pc, ref size_t line) { 612 assert(pc[0] == '\''); 613 for (size_t i = 1; ; ++i) { 614 auto const c = pc[i]; 615 if (c == '\n') { 616 ++line; 617 } 618 if (c == '\'') { 619 // That's about it 620 return munchChars(pc, i + 1); 621 } 622 if (c == '\\') { 623 ++i; 624 continue; 625 } 626 enforce(c, "Unterminated character constant: ", pc); 627 } 628 }