編譯原理的實驗:完成對C語言的詞法分析
先說一下整體框架:
基類:Base 封裝了一些基礎的字符判斷函數,如下:
int charkind(char c);//判斷字符類型 int spaces(char c); //當前空格是否可以消除 int characters(char c);//是否是字母 int keyword(char str[]);//是否是關鍵字 int signwords(char str[]);//是否是標識符 int numbers(char c);//是否是數字 int integers(char str[]);//是否是整數 int floats(char str[]);//是否是浮點型
派生類 LexAn 繼承Base並且封裝了對行和單詞處理的函數,如下:
void scanwords(); //處理每一行 void clearnotes();//清除注釋和多余的空格 void getwords(int state);//處理出單詞 void wordkind(char str[]);//判斷單詞類型並且輸出
函數之間調用關系如下:

好了,整體框架說完了,我們來說具體的實現:
(一)清除注釋和多余的空格
(1)C語言的注釋有//和/* 兩種形式,所以如果當前讀進的是 / 只需分情況判斷下一個:
如果是/ 那麼本行 //之後的肯定都是注釋,只需要保存注釋,更新當前行即可;
如果是* ,那麼接著尋找直至 */位置,保存注釋,更新當前行,然後繼續這個操作(有可能有本行有多個 /* */).
不足:不能處理跨行注釋。
(2)處理多余的空格這裡較為草率,只處理了形如if ( a >= b ),即特殊符號和字母(數字)之間的空格;只要空格兩端有特殊符號,那麼去掉當前空格便不會造成錯誤。
void LexAn::clearnotes()
{
int i, j, k;
int noteCount = 0;
int flag = 0;
char note[100];
/*注釋*/
for (i = 0; bufferin[buffernum][i] != '\0'; i++)
{
if (bufferin[buffernum][i] == '"')
{
flag = 1 - flag;
continue;
}
if (bufferin[buffernum][i] == '/' && flag == 0)
{
if (bufferin[buffernum][i + 1] == '/')
{
for (j = i; bufferin[buffernum][j] != '\0'; j++)
{
note[noteCount++] = bufferin[buffernum][j];
}
note[noteCount] = '\0';
noteCount = 0;
fprintf(fout, " [ %s ] ---- [ 注釋 ]\n", note);
bufferin[buffernum][i] = '\0';
break;
}
if (bufferin[buffernum][i + 1] == '*')
{
note[noteCount++] = '/';
note[noteCount++] = '*';
for (j = i + 2; bufferin[buffernum][j] != '\0'; j++)
{
note[noteCount++] = bufferin[buffernum][j];
if (bufferin[buffernum][j] == '*' && bufferin[buffernum][j + 1] == '/')
{
j += 2;
note[noteCount++] = bufferin[buffernum][j];
note[noteCount] = '\0';
noteCount = 0;
fprintf(fout, " [ %s ] ---- [ 注釋 ]\n", note);
break;
}
}
for (; bufferin[buffernum][j] != '\0'; j++, i++)
{
bufferin[buffernum][i] = bufferin[buffernum][j];
}
if (bufferin[buffernum][j] == '\0')
{
bufferin[buffernum][i] = '\0';
}
}
}
}
//空格
for (i = 0, flag = 0; bufferin[buffernum][i] != '\0'; i++)
{
if (bufferin[buffernum][i] == '"')
{
flag = 1 - flag;
continue;
}
if (bufferin[buffernum][i] == ' ' && flag == 0)
{
for (j = i + 1; bufferin[buffernum][j] != '\0' && bufferin[buffernum][j] == ' '; j++)
{
}
if (bufferin[buffernum][j] == '\0')
{
bufferin[buffernum][i] = '\0';
break;
}
if (bufferin[buffernum][j] != '\0' && ((spaces(bufferin[buffernum][j]) == 1) || (i > 0 && spaces(bufferin[buffernum][i - 1]) == 1)))
{
for (k = i; bufferin[buffernum][j] != '\0'; j++, k++)
{
bufferin[buffernum][k] = bufferin[buffernum][j];
}
bufferin[buffernum][k] = '\0';
i--;
}
}
}
//制表符
for (i = 0, flag = 0; bufferin[buffernum][i] != '\0'; i++)
{
if (bufferin[buffernum][i] == '\t')
{
for (j = i; bufferin[buffernum][j] != '\0'; j++)
{
bufferin[buffernum][j] = bufferin[buffernum][j + 1];
}
i = -1;
}
}
}
畫圖不是很好話,我盡量用語言清除地描述,大家還需結合源碼分析:
主要分為 <字母, 1> <數字, 2> <$ _ , 3> <4 ,/ >(轉義) < = ,5> <0,else >
state初始值設為0:
(1)如果首位字符是字母,那麼只可能是標識符和關鍵字,之後遇到除 數字,字母,$,_,之外的字符結束,取出單詞。
(2)如果首位字符是數字,那麼只能是數字,即八進制,十六進制,. ,數字,$ ,之後遇到除上述之外的字符結束,取出單詞。
(3)如果首位是$ _ ,那麼只能是標識符,即字母,數字,$,之後遇到除上述之外的字符結束,取出單詞。
(4)如果首位是特殊字符(" . () = 等),那麼再分開處理,流程和上述的一致,遇到不可能的組合結束;這部分看代碼吧。
//狀態機
void LexAn::getwords(int state)
{
char word[100];
int charCount = 0;
int finish = 0;
int num;
int i, j, k;
for (i = 0; bufferscan[i] != '\0'; i++)
{
switch (state / 10)
{
case 0:
switch (charkind(bufferscan[i]))
{
case 1:
word[charCount++] = bufferscan[i];
state = 10;
break;
case 2:
word[charCount++] = bufferscan[i];
state = 20;
break;
case 3:
word[charCount++] = bufferscan[i];
state = 30;
break;
case 0: case 5:
word[charCount++] = bufferscan[i];
switch (bufferscan[i])
{
case '"':
state = 41;
break;
case '\'':
state = 42;
break;
case '(': case ')': case '{': case '}': case '[': case ']': case ';': case ',': case '.':
state = 50;
word[charCount] = '\0';
finish = 1;
break;
case '=':
state = 43;
break;
default:
state = 40;
break;
}
break;
default: word[charCount++] = bufferscan[i]; break;
}
break;
case 1:
switch (charkind(bufferscan[i]))
{
case 1:
word[charCount++] = bufferscan[i];
state = 10;
break;
case 2:
word[charCount++] = bufferscan[i];
state = 20;
break;
case 3:
word[charCount++] = bufferscan[i];
state = 30;
break;
case 0:case 5:
word[charCount] = '\0';
num = 0;
while (word[num] != '\0')
num++;
//長度的處理 !!
if (num>7)
word[7] = '\0';
i--;
finish = 1;
state = 50;
break;
default: word[charCount++] = bufferscan[i]; break;
}
break;
case 2:
switch (charkind(bufferscan[i]))
{
case 1:
word[charCount++] = bufferscan[i];
state = 20;
break;
case 2:
word[charCount++] = bufferscan[i];
state = 20;
break;
case 3:
word[charCount++] = bufferscan[i];
state = 30;
break;
case 0:
if (bufferscan[i] == '.')
{
word[charCount++] = bufferscan[i];
state = 20;
break;
}
word[charCount] = '\0';
i--;
finish = 1;
state = 50;
break;
default: word[charCount++] = bufferscan[i]; break;
}
break;
case 3:
switch (charkind(bufferscan[i]))
{
case 1:
word[charCount++] = bufferscan[i];
state = 30;
break;
case 2:
word[charCount++] = bufferscan[i];
state = 30;
break;
case 3:
word[charCount++] = bufferscan[i];
state = 30;
break;
case 0:
word[charCount] = '\0';
i--;
finish = 1;
state = 50;
break;
default: word[charCount++] = bufferscan[i]; break;
}
break;
case 4:
switch (state)
{
case 40:
switch (charkind(bufferscan[i]))
{
case 1:
word[charCount] = '\0';
i--;
finish = 1;
state = 50;
break;
case 2:
word[charCount] = '\0';
i--;
finish = 1;
state = 50;
break;
case 3:
word[charCount] = '\0';
i--;
finish = 1;
state = 50;
break;
case 0:
word[charCount++] = bufferscan[i];
state = 40;
break;
default: word[charCount++] = bufferscan[i]; break;
}
break;
case 41:
word[charCount++] = bufferscan[i];
if (bufferscan[i] == '"')
{
if (charkind(bufferscan[i - 1]) == 4)
{
}
else
{
word[charCount] = '\0';
finish = 1;
state = 50;
}
}
break;
case 42:
word[charCount++] = bufferscan[i];
if (bufferscan[i] == '\'')
{
word[charCount] = '\0';
finish = 1;
state = 50;
}
break;
case 43:
if (bufferscan[i] == '=')
{
word[charCount++] = bufferscan[i];
state = 43;
}
else
{
word[charCount] = '\0';
finish = 1;
i--;
state = 50;
}
break;
default: word[charCount++] = bufferscan[i]; break;
}
break;
case 5:
finish = 0;
state = 0;
charCount = 0;
i--;
wordkind(word);
break;
default:break;
}
if (bufferscan[i + 1] == '\0')
{
word[charCount] = '\0';
wordkind(word);
}
}
}
(三)效果截圖:

本項目全部源碼放在個人 Github 上,歡迎大家star和fork學習哈。