scripts/genksyms/lex.l

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Lexical analysis for genksyms.
   4  * Copyright 1996, 1997 Linux International.
   5  *
   6  * New implementation contributed by Richard Henderson <rth@tamu.edu>
   7  * Based on original work by Bjorn Ekwall <bj0rn@blox.se>
   8  *
   9  * Taken from Linux modutils 2.4.22.
  10  */
  11
  12 %{
  13
  14 #include <limits.h>
  15 #include <stdlib.h>
  16 #include <string.h>
  17 #include <ctype.h>
  18
  19 #include "genksyms.h"
  20 #include "parse.tab.h"
  21
  22 /* We've got a two-level lexer here.  We let flex do basic tokenization
  23    and then we categorize those basic tokens in the second stage.  */
  24 #define YY_DECL         static int yylex1(void)
  25
  26 %}
   27
      /* Named patterns used by the rules section.
         IDENT    - identifier; a dollar sign is accepted alongside letters,
                    digits and underscore.
         INT      - octal, decimal or hexadecimal integer with an optional
                    U, L, UL or LU suffix (either case).
         REAL     - floating constant: a fraction with an optional exponent,
                    or digits with a mandatory exponent, followed by an
                    optional F or L suffix (either case).
         STRING, CHAR - double-quoted or single-quoted literal with an
                    optional L prefix; a backslash escapes the character
                    that follows it.
         MC_TOKEN - two-character operators: one operator character followed
                    by an equals sign, or one of && || -> << >>.  */
   28 IDENT                   [A-Za-z_\$][A-Za-z0-9_\$]*
   29
   30 O_INT                   0[0-7]*
   31 D_INT                   [1-9][0-9]*
   32 X_INT                   0[Xx][0-9A-Fa-f]+
   33 I_SUF                   [Uu]|[Ll]|[Uu][Ll]|[Ll][Uu]
   34 INT                     ({O_INT}|{D_INT}|{X_INT}){I_SUF}?
   35
   36 FRAC                    ([0-9]*\.[0-9]+)|([0-9]+\.)
   37 EXP                     [Ee][+-]?[0-9]+
   38 F_SUF                   [FfLl]
   39 REAL                    ({FRAC}{EXP}?{F_SUF}?)|([0-9]+{EXP}{F_SUF}?)
   40
   41 STRING                  L?\"([^\\\"]*\\.)*[^\\\"]*\"
   42 CHAR                    L?\'([^\\\']*\\.)*[^\\\']*\'
   43
   44 MC_TOKEN                ([~%^&*+=|<>/-]=)|(&&)|("||")|(->)|(<<)|(>>)
  45
  46 /* We don't do multiple input files.  */
  47 %option noyywrap
  48
  49 %option noinput
  50
  51 %%
  52
  53
   54  /* Keep track of our location in the original source files.
          A line made of a hash sign, blanks, an integer, blanks and a quoted
          name is returned as FILENAME; the second-stage lexer extracts the
          file name and the line number from its text.  Any other line that
          starts with a hash sign, and every bare newline, only increments
          cur_line.  */
   55 ^#[ \t]+{INT}[ \t]+\"[^\"\n]+\".*\n     return FILENAME;
   56 ^#.*\n                                  cur_line++;
   57 \n                                      cur_line++;
   58
   59  /* Ignore all other whitespace.  */
   60 [ \t\f\v\r]+                            ;
   61
   62
   63 {STRING}                                return STRING;
   64 {CHAR}                                  return CHAR;
   65 {IDENT}                                 return IDENT;
   66
   67  /* The Pedant requires that the other C multi-character tokens be
   68     recognized as tokens.  We don't actually use them since we don't
   69     parse expressions, but we do want whitespace to be arranged
   70     around them properly.  */
   71 {MC_TOKEN}                              return OTHER;
   72 {INT}                                   return INT;
   73 {REAL}                                  return REAL;
   74
   75 "..."                                   return DOTS;
   76
   77  /* All other tokens are single characters.  */
   78 .                                       return yytext[0];
  79
  80
  81 %%
  82
  83 /* Bring in the keyword recognizer.  */
  84
  85 #include "keywords.c"
  86
  87
  88 /* Macros to append to our phrase collection list.  */
  89
   90 /*
   91  * We mark any token that equals a known enumerator as
   92  * SYM_ENUM_CONST. The parser will change this for struct and union tags later;
   93  * the only problem is struct and union members:
   94  *    enum e { a, b }; struct s { int a, b; }
   95  * but in this case the only effect is that the ABI checksums become
   96  * more volatile, which is acceptable. Also, such collisions are quite rare;
   97  * so far one has only been observed in include/linux/telephony.h.
       *
       * _APP(T, L) records one token. It expects the variables cur_node and
       * next_node to be in scope at the point of expansion:
       *   - the current placeholder next_node becomes cur_node, the node
       *     being filled in;
       *   - a fresh placeholder is allocated and its next pointer is set to
       *     cur_node, so the chain runs from the newest node to the oldest;
       *   - cur_node->string receives a copy of L+1 bytes starting at T,
       *     i.e. the L bytes of text plus the byte after them (the callers
       *     pass NUL-terminated text, so that byte is the terminator);
       *   - cur_node->tag is SYM_ENUM_CONST when find_symbol() reports the
       *     string as a known enumerator and SYM_NORMAL otherwise;
       *   - cur_node->in_source_file is copied from the in_source_file flag.
       * The arguments are not parenthesized in the expansion, so they should
       * be simple expressions.
       *
       * APP is the common case: record the text of the token that was just
       * scanned (yytext, yyleng).
   98  */
   99 #define _APP(T,L)       do {                                               \
  100                           cur_node = next_node;                            \
  101                           next_node = xmalloc(sizeof(*next_node));         \
  102                           next_node->next = cur_node;                      \
  103                           cur_node->string = memcpy(xmalloc(L+1), T, L+1); \
  104                           cur_node->tag =                                  \
  105                             find_symbol(cur_node->string, SYM_ENUM_CONST, 1)?\
  106                             SYM_ENUM_CONST : SYM_NORMAL ;                  \
  107                           cur_node->in_source_file = in_source_file;       \
  108                         } while (0)
  109
  110 #define APP             _APP(yytext, yyleng)
 111
 112
  113 /* The second stage lexer.  Here we incorporate knowledge of the state
  114    of the parser to tailor the tokens that are returned.

         Each call pulls raw tokens from yylex1() until it has one token to
         hand back, and returns 0 as soon as yylex1() returns 0.  FILENAME
         tokens are consumed here and never returned: they only update
         cur_filename, cur_line and in_source_file.

         The text of the consumed tokens is recorded with APP / _APP as a
         chain of string_list nodes.  While lexstate is one of the collecting
         states, a whole balanced group is gathered within a single call and
         returned as one phrase token (ATTRIBUTE_PHRASE, ASM_PHRASE,
         TYPEOF_PHRASE, BRACKET_PHRASE, BRACE_PHRASE, EXPRESSION_PHRASE or
         STATIC_ASSERT_PHRASE).  Before a non-zero token is returned, yylval
         is set to the address of next_node->next, which is the link to the
         most recently recorded node.  */
  115
  116 int
  117 yylex(void)
  118 {
        /* Lexer state; static, so it persists from one call to the next.  */
  119   static enum {
  120     ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_TYPEOF, ST_TYPEOF_1,
  121     ST_BRACKET, ST_BRACE, ST_EXPRESSION, ST_STATIC_ASSERT,
  122   } lexstate = ST_NOTSTARTED;
  123
        /* Both counters are set when struct, union or enum is seen and are
           decremented once per returned token at the fini label.
           suppress_type_lookup: while non-zero, an identifier is not turned
           into TYPE.  dont_want_brace_phrase: while non-zero, an opening
           brace is returned as itself instead of starting a brace phrase.
           next_node is the empty placeholder at the head of the node chain;
           source_file remembers the first file name seen.  */
  124   static int suppress_type_lookup, dont_want_brace_phrase;
  125   static struct string_list *next_node;
  126   static char *source_file;
  127
        /* count is the nesting depth of the group being collected.  It is an
           automatic variable, so it starts at 0 on every call.  */
  128   int token, count = 0;
  129   struct string_list *cur_node;
  130
        /* First call only: create the initial placeholder node.  */
  131   if (lexstate == ST_NOTSTARTED)
  132     {
  133       next_node = xmalloc(sizeof(*next_node));
  134       next_node->next = NULL;
  135       lexstate = ST_NORMAL;
  136     }
  137
  138 repeat:
  139   token = yylex1();
  140
  141   if (token == 0)
  142     return 0;
  143   else if (token == FILENAME)
  144     {
  145       char *file, *e;
  146
  147       /* Save the filename and line number for later error messages.  */
  148
  149       if (cur_filename)
  150         free(cur_filename);
  151
            /* file points just past the opening quote and e at the closing
               quote, which is overwritten with NUL inside yytext so that the
               name can be copied together with its terminator.  */
  152       file = strchr(yytext, '\"')+1;
  153       e = strchr(file, '\"');
  154       *e = '\0';
  155       cur_filename = memcpy(xmalloc(e-file+1), file, e-file+1);
            /* Skip the hash sign and the first blank; atoi() skips any
               further leading blanks before the line number.  */
  156       cur_line = atoi(yytext+2);
  157
            /* The first file name seen is taken as the source file; after
               that in_source_file tells whether the current file name is
               equal to it.  */
  158       if (!source_file) {
  159         source_file = xstrdup(cur_filename);
  160         in_source_file = 1;
  161       } else {
  162         in_source_file = (strcmp(cur_filename, source_file) == 0);
  163       }
  164
  165       goto repeat;
  166     }
  167
  168   switch (lexstate)
  169     {
  170     case ST_NORMAL:
  171       switch (token)
  172         {
  173         case IDENT:
  174           APP;
  175           {
                  /* A result of 0 or more is a keyword token code and
                     replaces IDENT.  */
  176             int r = is_reserved_word(yytext, yyleng);
  177             if (r >= 0)
  178               {
  179                 switch (token = r)
  180                   {
                        /* These four keywords start a collecting state; the
                           keyword text has already been recorded and the
                           rest of the phrase is gathered in this same
                           call.  */
  181                   case ATTRIBUTE_KEYW:
  182                     lexstate = ST_ATTRIBUTE;
  183                     count = 0;
  184                     goto repeat;
  185                   case ASM_KEYW:
  186                     lexstate = ST_ASM;
  187                     count = 0;
  188                     goto repeat;
  189                   case TYPEOF_KEYW:
  190                     lexstate = ST_TYPEOF;
  191                     count = 0;
  192                     goto repeat;
  193
  194                   case STRUCT_KEYW:
  195                   case UNION_KEYW:
  196                   case ENUM_KEYW:
                          /* Both counters lose one at fini when this keyword
                             is returned.  That leaves the typedef lookup
                             disabled for the next returned token, and keeps
                             an opening brace that is one of the next two
                             returned tokens from starting a brace phrase.  */
  197                     dont_want_brace_phrase = 3;
  198                     suppress_type_lookup = 2;
  199                     goto fini;
  200
  201                   case EXPORT_SYMBOL_KEYW:
  202                       goto fini;
  203
  204                   case STATIC_ASSERT_KEYW:
  205                     lexstate = ST_STATIC_ASSERT;
  206                     count = 0;
  207                     goto repeat;
  208                   }
  209               }
                  /* Reached for plain identifiers and for keywords without a
                     case above: a name known as a typedef becomes TYPE.  */
  210             if (!suppress_type_lookup)
  211               {
  212                 if (find_symbol(yytext, SYM_TYPEDEF, 1))
  213                   token = TYPE;
  214               }
  215           }
  216           break;
  217
  218         case '[':
  219           APP;
  220           lexstate = ST_BRACKET;
  221           count = 1;
  222           goto repeat;
  223
  224         case '{':
  225           APP;
  226           if (dont_want_brace_phrase)
  227             break;
  228           lexstate = ST_BRACE;
  229           count = 1;
  230           goto repeat;
  231
  232         case '=': case ':':
                /* The token itself is returned now; the expression that
                   follows is collected by the next call.  */
  233           APP;
  234           lexstate = ST_EXPRESSION;
  235           break;
  236
  237         case DOTS:
  238         default:
  239           APP;
  240           break;
  241         }
  242       break;
  243
          /* Record every token and track parenthesis depth; the closing
             parenthesis that brings the depth back to 0 ends the phrase.  */
  244     case ST_ATTRIBUTE:
  245       APP;
  246       switch (token)
  247         {
  248         case '(':
  249           ++count;
  250           goto repeat;
  251         case ')':
  252           if (--count == 0)
  253             {
  254               lexstate = ST_NORMAL;
  255               token = ATTRIBUTE_PHRASE;
  256               break;
  257             }
  258           goto repeat;
  259         default:
  260           goto repeat;
  261         }
  262       break;
  263
          /* Same scheme as ST_ATTRIBUTE, ending with ASM_PHRASE.  */
  264     case ST_ASM:
  265       APP;
  266       switch (token)
  267         {
  268         case '(':
  269           ++count;
  270           goto repeat;
  271         case ')':
  272           if (--count == 0)
  273             {
  274               lexstate = ST_NORMAL;
  275               token = ASM_PHRASE;
  276               break;
  277             }
  278           goto repeat;
  279         default:
  280           goto repeat;
  281         }
  282       break;
  283
          /* Entered right after the first opening parenthesis of a typeof,
             which has not been recorded yet.  If the next token is a keyword
             or a typedef name, the identifier and the parenthesis are pushed
             back into the input and TYPEOF_KEYW is returned in ST_NORMAL, so
             the following tokens are delivered one by one.  Otherwise the
             token is handled by the ST_TYPEOF code below.
             NOTE(review): the deferred parenthesis is recorded only when the
             token is an identifier; for any other token it is left out of
             the recorded text.  Changing this would alter the recorded
             phrase text - confirm before touching.  */
  284     case ST_TYPEOF_1:
  285       if (token == IDENT)
  286         {
  287           if (is_reserved_word(yytext, yyleng) >= 0
  288               || find_symbol(yytext, SYM_TYPEDEF, 1))
  289             {
  290               yyless(0);
  291               unput('(');
  292               lexstate = ST_NORMAL;
  293               token = TYPEOF_KEYW;
  294               break;
  295             }
  296           _APP("(", 1);
  297         }
  298         lexstate = ST_TYPEOF;
  299         /* FALLTHRU */
  300
  301     case ST_TYPEOF:
  302       switch (token)
  303         {
  304         case '(':
                /* The first parenthesis is not recorded here; ST_TYPEOF_1
                   decides what to do with it after seeing the next token.  */
  305           if ( ++count == 1 )
  306             lexstate = ST_TYPEOF_1;
  307           else
  308             APP;
  309           goto repeat;
  310         case ')':
  311           APP;
  312           if (--count == 0)
  313             {
  314               lexstate = ST_NORMAL;
  315               token = TYPEOF_PHRASE;
  316               break;
  317             }
  318           goto repeat;
  319         default:
  320           APP;
  321           goto repeat;
  322         }
  323       break;
  324
          /* Entered from ST_NORMAL with count already 1 for the opening
             bracket; ends at the matching closing bracket.  */
  325     case ST_BRACKET:
  326       APP;
  327       switch (token)
  328         {
  329         case '[':
  330           ++count;
  331           goto repeat;
  332         case ']':
  333           if (--count == 0)
  334             {
  335               lexstate = ST_NORMAL;
  336               token = BRACKET_PHRASE;
  337               break;
  338             }
  339           goto repeat;
  340         default:
  341           goto repeat;
  342         }
  343       break;
  344
          /* Entered from ST_NORMAL with count already 1 for the opening
             brace; ends at the matching closing brace.  */
  345     case ST_BRACE:
  346       APP;
  347       switch (token)
  348         {
  349         case '{':
  350           ++count;
  351           goto repeat;
  352         case '}':
  353           if (--count == 0)
  354             {
  355               lexstate = ST_NORMAL;
  356               token = BRACE_PHRASE;
  357               break;
  358             }
  359           goto repeat;
  360         default:
  361           goto repeat;
  362         }
  363       break;
  364
          /* Collect everything up to a comma, semicolon or closing brace
             seen at nesting depth 0.  That terminator is not recorded; it is
             pushed back so that the next call returns it.  */
  365     case ST_EXPRESSION:
  366       switch (token)
  367         {
  368         case '(': case '[': case '{':
  369           ++count;
  370           APP;
  371           goto repeat;
  372         case '}':
  373           /* is this the last line of an enum declaration? */
  374           if (count == 0)
  375             {
  376               /* Put back the token we just read so's we can find it again
  377                  after registering the expression.  */
  378               unput(token);
  379
  380               lexstate = ST_NORMAL;
  381               token = EXPRESSION_PHRASE;
  382               break;
  383             }
  384           /* FALLTHRU */
  385         case ')': case ']':
  386           --count;
  387           APP;
  388           goto repeat;
  389         case ',': case ';':
  390           if (count == 0)
  391             {
  392               /* Put back the token we just read so's we can find it again
  393                  after registering the expression.  */
  394               unput(token);
  395
  396               lexstate = ST_NORMAL;
  397               token = EXPRESSION_PHRASE;
  398               break;
  399             }
  400           APP;
  401           goto repeat;
  402         default:
  403           APP;
  404           goto repeat;
  405         }
  406       break;
  407
          /* Same scheme as ST_ATTRIBUTE, ending with STATIC_ASSERT_PHRASE.  */
  408     case ST_STATIC_ASSERT:
  409       APP;
  410       switch (token)
  411         {
  412         case '(':
  413           ++count;
  414           goto repeat;
  415         case ')':
  416           if (--count == 0)
  417             {
  418               lexstate = ST_NORMAL;
  419               token = STATIC_ASSERT_PHRASE;
  420               break;
  421             }
  422           goto repeat;
  423         default:
  424           goto repeat;
  425         }
  426       break;
  427
          /* No other value of lexstate is handled.  */
  428     default:
  429       exit(1);
  430     }
  431 fini:
  432
        /* One token is about to be returned: age the two per-token
           counters.  */
  433   if (suppress_type_lookup > 0)
  434     --suppress_type_lookup;
  435   if (dont_want_brace_phrase > 0)
  436     --dont_want_brace_phrase;
  437
        /* next_node->next is the link to the most recently recorded node.  */
  438   yylval = &next_node->next;
  439
  440   return token;
  441 }