This commit is contained in:
LiuYuanchi 2024-05-05 21:51:08 +08:00
commit d6861884f2
20 changed files with 2348 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*/build/
build/

24
CMakeLists.txt Normal file
View File

@ -0,0 +1,24 @@
cmake_minimum_required(VERSION 3.10)
project(compiler-bin)
# Collect the sources of each module (LL(1) parser, NFA lexer, driver).
file(GLOB SOURCES_LL "LL1/*.cpp")
file(GLOB SOURCES_NFA "nfa/*.cpp")
file(GLOB SOURCES_MAIN "main/*.cpp")
# Put the resulting executable under <build>/bin.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
# Build the parser and lexer modules as static libraries.
add_library(LL STATIC ${SOURCES_LL})
add_library(nfa STATIC ${SOURCES_NFA})
# Export each module's headers to its consumers.
# BUG FIX: the LL sources live in LL1/ (see the glob above); the include
# path previously pointed at the non-existent LL/ directory.
target_include_directories(LL PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/LL1)
target_include_directories(nfa PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/nfa)
# Driver executable.
add_executable(main ${SOURCES_MAIN})
# Link the driver against both modules.
target_link_libraries(main PRIVATE LL nfa)

14
LL1/CMakeLists.txt Normal file
View File

@ -0,0 +1,14 @@
cmake_minimum_required(VERSION 3.10)
project(LL)
# Collect this module's sources.
file(GLOB SOURCES "*.cpp")
# Output directory for build artifacts.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
# Build the LL(1) parser as a static library.
add_library(LL STATIC ${SOURCES})
# Expose this directory's headers to consumers of the library.
target_include_directories(LL PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
# NOTE(review): the top-level CMakeLists.txt globs LL1/*.cpp directly and
# never add_subdirectory()s this file - presumably it exists for building
# the module standalone. Confirm before relying on it.

351
LL1/LL1.cpp Normal file
View File

@ -0,0 +1,351 @@
#include <set>
#include <algorithm>
#include <stack>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include "LL1.h"
LL1::LL1()
{
    // Load the grammar from disk, then precompute the helper sets
    // (infer-empty flags, FIRST and FOLLOW) used by the LL(1) routines.
    read_grammar();
    init_grammar_set();
}
LL1::~LL1()
{
    // Nothing to release: all members are value types.
}
/**
 * Checks whether the loaded grammar is LL(1).
 *
 * For every production A -> X1...Xn the SELECT set is computed:
 *   - if FIRST(X1...Xn) != {$}, SELECT = FIRST(X1...Xn) \ {$};
 *   - otherwise SELECT = FOLLOW(A) with {$} added.
 * The grammar is LL(1) iff the SELECT sets of productions sharing a
 * left-hand side are pairwise disjoint.
 *
 * @return true if the grammar is LL(1); false otherwise (a diagnostic
 *         is printed on the first conflict found).
 */
bool LL1::IsLL1()
{
    string symbol;
    vector<string> right_first = vector<string>();
    vector<string> left_follow;
    for (int i = 0; i < grammar_rules.size(); i++) {
        symbol.clear();
        right_first.clear();
        left_follow.clear();
        symbol = grammar_rules[i].first;
        // FOLLOW set of the production's left-hand side.
        left_follow = follow[symbol];
        // FIRST set of the right-hand side: start with the non-$ symbols
        // of FIRST(X1).
        for (int j = 0; j < first[grammar_rules[i].second[0]].size(); j++) {
            if (first[grammar_rules[i].second[0]][j] == "$") {
                continue;
            }
            right_first.push_back(first[grammar_rules[i].second[0]][j]);
        }
        int cnt;
        for (cnt = 1; cnt < grammar_rules[i].second.size(); cnt++) {
            // Keep adding FIRST(X_cnt) only while every preceding symbol
            // can derive the empty string $.
            if (!infer_empty[grammar_rules[i].second[cnt - 1]]) {
                break;
            }
            for (int j = 0; j < first[grammar_rules[i].second[cnt]].size(); j++) {
                if (first[grammar_rules[i].second[cnt]][j] == "$") {
                    continue;
                }
                right_first.push_back(first[grammar_rules[i].second[cnt]][j]);
            }
        }
        // If the whole right-hand side can derive $, add $ itself.
        if (cnt == grammar_rules[i].second.size() && infer_empty[grammar_rules[i].second[0]]) {
            right_first.push_back("$");
        }
        // Deduplicate (std::set also leaves the elements sorted).
        set<string> sright_first(right_first.begin(), right_first.end());
        right_first.assign(sright_first.begin(), sright_first.end());
        vector<string> symbol_select;
        if (right_first.size() == 1 && right_first[0] == "$") {
            // Right-hand side FIRST set is exactly {$}:
            // SELECT = FOLLOW(left) with $ added.
            symbol_select = left_follow;
            if (find(left_follow.begin(), left_follow.end(), "$") == left_follow.end()) {
                symbol_select.push_back("$");
            }
        }
        else
        {
            // Otherwise SELECT is the right-hand side FIRST set.
            symbol_select = right_first;
        }
        // Keep SELECT sorted so the set algorithms below are valid.
        sort(symbol_select.begin(), symbol_select.end());
        vector<string> new_select = vector<string>();
        if (select.find(symbol) == select.end()) {
            // First production seen with this left-hand side.
            select[symbol] = symbol_select;
        }
        else {
            // Another production with the same left-hand side: its SELECT
            // set must be disjoint from everything accumulated so far.
            set_intersection(symbol_select.begin(), symbol_select.end(), select[symbol].begin(), select[symbol].end(), back_inserter(new_select));
            if (new_select.size() == 0) {
                // Disjoint: accumulate the union so LATER productions are
                // checked against all earlier ones.
                set_union(symbol_select.begin(), symbol_select.end(), select[symbol].begin(), select[symbol].end(), back_inserter(new_select));
                // BUG FIX: the union was previously computed and then
                // discarded, so a conflict with any production other than
                // the first could go undetected. Store it back.
                select[symbol] = new_select;
            }
            else
            {
                // Overlapping SELECT sets: not an LL(1) grammar.
                cout << "This grammar is not LL (1) grammar" << endl;
                return false;
            }
        }
    }
    return true;
}
/**
 * Builds the LL(1) predictive parsing table LL1_predict:
 * nonterminal -> (lookahead terminal -> index into grammar_rules).
 * Assumes first/follow/infer_empty were filled by init_grammar_set().
 */
void LL1::build_LL1_predict()
{
    // Create an (empty) row for every nonterminal.
    for (int i = 0; i < VNs.size(); i++) {
        if (LL1_predict.find(VNs[i]) == LL1_predict.end()) {
            LL1_predict[VNs[i]] = unordered_map<string, int>();
        }
    }
    string symbol;
    vector<string> right_first = vector<string>();
    vector<string> left_follow;
    // Walk every production and fill its row of the table.
    for (int i = 0; i < grammar_rules.size(); i++) {
        symbol.clear();
        right_first.clear();
        left_follow.clear();
        symbol = grammar_rules[i].first;
        // FOLLOW set of the production's left-hand side.
        left_follow = follow[symbol];
        unordered_map<string, int> &symbol_predict = LL1_predict[symbol];
        // FIRST set of the right-hand side: start with the non-$
        // symbols of FIRST(X1).
        for (int j = 0; j < first[grammar_rules[i].second[0]].size(); j++) {
            if (first[grammar_rules[i].second[0]][j] == "$") {
                continue;
            }
            right_first.push_back(first[grammar_rules[i].second[0]][j]);
        }
        int cnt;
        for (cnt = 1; cnt < grammar_rules[i].second.size(); cnt++) {
            // Keep adding FIRST(X_cnt) only while every preceding symbol
            // can derive the empty string $.
            if (!infer_empty[grammar_rules[i].second[cnt - 1]]) {
                break;
            }
            for (int j = 0; j < first[grammar_rules[i].second[cnt]].size(); j++) {
                if (first[grammar_rules[i].second[cnt]][j] == "$") {
                    continue;
                }
                right_first.push_back(first[grammar_rules[i].second[cnt]][j]);
            }
        }
        // If the whole right-hand side can derive $, add $ itself.
        if (cnt == grammar_rules[i].second.size() && infer_empty[grammar_rules[i].second[0]]) {
            right_first.push_back("$");
        }
        // Deduplicate the right-hand side FIRST set.
        set<string> sright_first(right_first.begin(), right_first.end());
        right_first.clear();
        right_first.resize(sright_first.size());
        right_first.assign(sright_first.begin(), sright_first.end());
        // Fill one table cell per lookahead in the FIRST set.
        for (int j = 0; j < right_first.size(); j++) {
            if (right_first[j] == "$") {
                // Epsilon case: register (or reuse) an explicit A -> $
                // rule and map every FOLLOW(A) lookahead to it.
                pair<string, vector<string>> new_rule (grammar_rules[i].first, vector<string>());
                new_rule.second.push_back("$");
                int rule_id = insert_rule(new_rule);
                for (int k = 0; k < left_follow.size(); k++) {
                    symbol_predict[left_follow[k]] = rule_id;
                }
            }
            // NOTE(review): there is no `continue` above, so when
            // right_first[j] == "$" this line also sets the "$" column to
            // rule i (the full production, not the epsilon rule) - confirm
            // this is intentional.
            symbol_predict[right_first[j]] = i;
        }
    }
}
// Dump the LL(1) predictive parsing table: one line per nonterminal,
// listing "terminal,rule-index" cells.
void LL1::print_LL1_predict()
{
    cout << "[LL1_predict]:" << endl;
    for (const auto& row : LL1_predict) {
        cout << row.first << " ";
        for (const auto& cell : row.second) {
            cout << cell.first << "," << cell.second << " ";
        }
        cout << endl;
    }
    cout << endl << endl;
}
/**
 * Runs the table-driven LL(1) parse over token_strings and records one
 * log line per step into LL1_grammar_log, formatted as
 * "<stack-top>#<lookahead>\t<action>" with action one of
 * accept / move / error / reduction. Stops at the first error.
 */
void LL1::build_LL1_grammar()
{
    // Parse stack of grammar symbols (shadows the std::stack type name).
    stack<string> stack;
    int token_cnt = 0;
    // Push the start symbol.
    stack.push(start);
    while (!stack.empty())
    {
        LL1_grammar_log.push_back(string());
        // Log the stack top; "$" (empty/end marker) is shown as EOF.
        if (stack.top() == "$") {
            LL1_grammar_log.back() += "EOF";
        }
        else
        {
            LL1_grammar_log.back() += stack.top();
        }
        // "#" separates stack top from lookahead in the log line.
        LL1_grammar_log.back() += "#";
        // Current lookahead token; past the last token it becomes "$".
        string this_token;
        if (token_cnt == token_strings.size()) {
            // Input exhausted - logged as EOF.
            this_token = "$";
            LL1_grammar_log.back() += "EOF";
        }
        else
        {
            this_token = token_strings[token_cnt];
            LL1_grammar_log.back() += token_strings[token_cnt];
        }
        // Compare the stack top against the lookahead.
        if (stack.top() == this_token) {
            // Match: pop and advance the input.
            token_cnt++;
            stack.pop();
            if (this_token == "$") {
                // Both stack and input exhausted: parse accepted.
                LL1_grammar_log.back() += "\taccept";
            }
            else
            {
                // Terminal matched: consume it.
                LL1_grammar_log.back() += "\tmove";
            }
        }
        // Stack top is a terminal that does not match the lookahead.
        else if (find(VTs.begin(), VTs.end(), stack.top()) != VTs.end()) {
            if (stack.top() == "$") {
                // An epsilon marker on the stack: discard it silently
                // (its log line is removed as well).
                stack.pop();
                LL1_grammar_log.pop_back();
            }
            else {
                LL1_grammar_log.back() += "\terror";
                return;
            }
        }
        else
        {
            // Stack top is a nonterminal: consult the predictive table.
            auto tab = LL1_predict[stack.top()];
            if (tab.find(this_token) == tab.end()) {
                // No table entry for this lookahead: syntax error.
                LL1_grammar_log.back() += "\terror";
                return;
            }
            else
            {
                // Expand: pop the nonterminal and push the production's
                // right-hand side in reverse so X1 ends up on top.
                auto this_rule = grammar_rules[tab[this_token]];
                stack.pop();
                for (int i = this_rule.second.size() - 1; i >= 0; i--) {
                    stack.push(this_rule.second[i]);
                }
                LL1_grammar_log.back() += "\treduction";
            }
        }
    }
}
// Print every recorded parse step, one per line.
void LL1::print_LL1_grammar_log()
{
    for (const auto& entry : LL1_grammar_log) {
        cout << entry << endl;
    }
}
/**
 * Writes the recorded parse log to @p file_name, one step per line.
 *
 * BUG FIX: uses ofstream so the file is created (and truncated) when it
 * does not exist - the previous fstream with default flags (in|out)
 * fails to open a missing file. Also returns early on open failure
 * instead of writing through a bad stream.
 */
void LL1::fileout_LL1_grammar_log(string file_name)
{
    ofstream outfile(file_name);
    if (!outfile.is_open()) {
        cout << "[FILEOUT] fail to open file" << endl;
        return;
    }
    for (int i = 0; i < LL1_grammar_log.size(); ++i) {
        outfile << LL1_grammar_log[i] << endl;
    }
    outfile.close();
}
// Returns the index of new_rule inside grammar_rules, appending the
// rule first when it is not present yet.
int LL1::insert_rule(pair<string, vector<string>>& new_rule)
{
    int idx = 0;
    for (const auto& rule : grammar_rules) {
        // An identical production already exists: reuse its index.
        if (rule.first == new_rule.first && rule.second == new_rule.second) {
            return idx;
        }
        ++idx;
    }
    // Unknown production: append it; idx is now its index.
    grammar_rules.push_back(new_rule);
    return idx;
}

32
LL1/LL1.h Normal file
View File

@ -0,0 +1,32 @@
// LL1 语法分析器
#ifndef LL1_H
#define LL1_H
#include "grammar.h"
using namespace std;
class LL1:public Grammar{
public:
LL1();
~LL1();
bool IsLL1(); // 判断该文法是否为 LL1 文法
void build_LL1_predict(); // 构建 LL1 的预测分析表
void print_LL1_predict(); // 打印 LL1 的预测分析表
void build_LL1_grammar(); // 构建规约序列
void print_LL1_grammar_log();
void fileout_LL1_grammar_log(string file_name);
private:
unordered_map<string, vector<string>> select; // 计算符号的 SELECT 集合
unordered_map<string, unordered_map<string, int>> LL1_predict; // LL1 的预测分析表
vector<string> LL1_grammar_log; // 规约序列
int insert_rule(pair<string, vector<string>>& new_rule); // 增加新的规则
};
#endif // !LL1_H

520
LL1/grammar.cpp Normal file
View File

@ -0,0 +1,520 @@
#include <deque>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <set>
#include "grammar.h"
Grammar::Grammar()
{
    // Intentionally empty: loading is driven explicitly via read_grammar()
    // (the LL1 subclass calls it from its own constructor).
}
Grammar::~Grammar()
{
    // Nothing to release: all members are value types.
}
void Grammar::read_grammar() {
fstream infile;
infile.open(grammar_file,ios::in);
if (!infile.is_open())
{
cout << "[READ_GRAMMAR] fail to open file: "<< grammar_file << endl;
return;
}
string buf;
string arrow = "->";
string farrow;
bool start_flag = true;
string left;
string forms;
while (!infile.eof()) {
// 清理 string
buf.clear();
left.clear();
forms.clear();
farrow.clear();
grammar_rules.push_back(pair<string, vector<string>>());
getline(infile, buf);
stringstream ss(buf);
// 读取产生式左侧
ss >> left;
grammar_rules.back().first = left;
symbols.push_back(left);
VNs.push_back(left);
// 存储 start
if (start_flag) {
start = left;
start_flag = false;
}
// 读取 -> 符号 并保证合法
ss >> farrow;
if (farrow != arrow) {
cout << "Grammar reading error" << endl;
}
// 读取产生式右侧
while (ss >> forms)
{
grammar_rules.back().second.push_back(forms);
symbols.push_back(forms);
forms.clear();
}
}
// 符号集 和 非终结符 去重
set<string> ssymbols(symbols.begin(), symbols.end());
symbols.clear();
symbols.resize(ssymbols.size());
symbols.assign(ssymbols.begin(), ssymbols.end());
set<string> sVNs(VNs.begin(), VNs.end());
VNs.clear();
VNs.resize(sVNs.size());
VNs.assign(sVNs.begin(), sVNs.end());
// 符号集 和 非终结符 排序 以保证差集的成功
sort(symbols.begin(), symbols.end());
sort(VNs.begin(), VNs.end());
// 取差集 得到终极符
set_difference(symbols.begin(), symbols.end(), VNs.begin(), VNs.end(), back_inserter(VTs));
infile.close();
}
void Grammar::print_grammar()
{
cout << "[start]: " << endl << start << endl << endl;
cout << "[VTs]:" << endl;
for (int i = 0; i < VTs.size(); i++) {
cout << VTs[i] << " ";
if (((i + 1) % 5) == 0)
cout << endl;
}
cout << endl << endl;
cout << "[VNs]:" << endl;
for (int i = 0; i < VNs.size(); i++) {
cout << VNs[i] << " ";
if (((i + 1) % 5) == 0)
cout << endl;
}
cout << endl << endl;
cout << "[symbols]:" << endl;
for (int i = 0; i < symbols.size(); i++) {
cout << symbols[i] << " ";
if (((i + 1) % 5) == 0)
cout << endl;
}
cout << endl << endl;
cout << "[grammar_rules]: " << grammar_rules.size() << endl;
for (int i = 0; i < grammar_rules.size(); ++i) {
cout << grammar_rules[i].first << " -> ";
for (int j = 0; j < grammar_rules[i].second.size(); ++j) {
cout << "\"" << grammar_rules[i].second[j] << "\" ";
}
cout << endl;
}
cout << endl << endl;
}
// Augments the grammar: adds a fresh start symbol S' with the single
// production S' -> S (inserted as rule 0) and makes S' the new start.
void Grammar::expand_grammar()
{
    string new_start = start + "\'";
    pair<string, vector<string>> new_rule = pair<string, vector<string>>(new_start, vector<string>());
    new_rule.second.push_back(start);
    VNs.push_back(new_start);
    symbols.push_back(new_start);
    grammar_rules.insert(grammar_rules.begin(), new_rule);
    start = new_start;
    // Keep the symbol list sorted (set algorithms elsewhere rely on it).
    sort(symbols.begin(), symbols.end());
    // NOTE(review): VNs is not re-sorted here - confirm no later set
    // operation depends on VNs staying sorted after this call.
}
/**
 * Precomputes every helper set the parser needs, in dependency order:
 *   1. infer_empty  - can each symbol derive the empty string $?
 *   2. appears/depend tables (init_appears_depend).
 *   3. FIRST sets for every symbol.
 *   4. FOLLOW sets, grown to a fixpoint with a worklist.
 */
void Grammar::init_grammar_set()
{
    string symbol;
    // Pass 1: decide for every symbol whether it can derive $.
    for (int i = 0; i < symbols.size(); i++) {
        symbol = symbols[i];
        this->symbol_infer_empty(symbol);
        symbol.clear();
    }
    // Pass 2: record where each symbol appears and FOLLOW dependencies.
    init_appears_depend();
    // Pass 3: FIRST set of every symbol.
    for (int i = 0; i < symbols.size(); i++) {
        symbol = symbols[i];
        this->symbol_infer_first(symbol);
        symbol.clear();
    }
    // Pass 4: FOLLOW sets - seed then iterate to a fixpoint.
    // Worklist of symbols whose FOLLOW set may still grow.
    deque<string> queue;
    // Seed the start symbol's FOLLOW set; it additionally contains the
    // end marker $.
    follow[start] = this->symbol_infer_follow(start);
    follow[start].push_back("$");
    queue.push_back(start);
    // Seed the FOLLOW set of every other symbol.
    for (int i = 0; i < symbols.size(); i++) {
        symbol = symbols[i];
        if (symbol == start) {
            symbol.clear();
            continue;
        }
        follow[symbol] = this->symbol_infer_follow(symbol);
        queue.push_back(symbol);
        symbol.clear();
    }
    // Fixpoint: recompute a symbol's FOLLOW set; if it grew, requeue all
    // symbols whose FOLLOW sets depend on it.
    while (!queue.empty()) {
        symbol = queue.front();
        queue.pop_front();
        vector<string> new_symbol_follow = this->symbol_infer_follow(symbol);
        // Sets only ever grow, so a size change means new elements.
        if (follow[symbol].size() < new_symbol_follow.size()) {
            vector<string> dep = depend[symbol];
            for (int i = 0; i < dep.size(); i++) {
                queue.push_back(dep[i]);
            }
            follow[symbol] = new_symbol_follow;
        }
        symbol.clear();
    }
}
void Grammar::print_grammar_set()
{
// 打印符号在产生式的出现情况
cout << "[left_appears]:" << endl;
for (int i = 0; i < symbols.size(); i++) {
cout << "LEFT( " << symbols[i] << " ) = {";
for (int j = 0; j < left_appears[symbols[i]].size(); j++) {
cout << " " << left_appears[symbols[i]][j] << " ";
}
cout << "}" << endl;
}
cout << endl << endl;
cout << "[right_appears]:" << endl;
for (int i = 0; i < symbols.size(); i++) {
cout << "RIGHT( " << symbols[i] << " ) = {";
for (int j = 0; j < right_appears[symbols[i]].size(); j++) {
cout << " " << right_appears[symbols[i]][j] << " ";
}
cout << "}" << endl;
}
cout << endl << endl;
// 打印 FOLLOW 集的依赖关系
cout << "[depend]:" << endl;
for (int i = 0; i < symbols.size(); i++) {
cout << "DEPEND( " << symbols[i] << " ) = {";
for (int j = 0; j < depend[symbols[i]].size(); j++) {
cout << " " << depend[symbols[i]][j] << " ";
}
cout << "}" << endl;
}
cout << endl << endl;
// 打印是否可以推导出 $ 空符号
cout << "[infer_empty]:" << endl;
for (int i = 0; i < symbols.size(); i++) {
cout << symbols[i]<<" -> " << infer_empty[symbols[i]] << endl;
}
cout << endl << endl;
// 打印 FIRST 集
cout << "[FIRST]:" << endl;
for (int i = 0; i < symbols.size(); i++) {
cout << "FIRST( " << symbols[i] << " ) = {";
for (int j = 0; j < first[symbols[i]].size(); j++) {
cout << " " << first[symbols[i]][j] << " ";
}
cout << "}" << endl;
}
cout << endl << endl;
// 打印 FOLLOW 集
cout << "[FOLLOW]:" << endl;
for (int i = 0; i < symbols.size(); i++) {
cout << "FOLLOW( " << symbols[i] << " ) = {";
for (int j = 0; j < follow[symbols[i]].size(); j++) {
cout << " " << follow[symbols[i]][j] << " ";
}
cout << "}" << endl;
}
cout << endl << endl;
}
// Stores a copy of the recognized token stream for later parsing.
// (Despite the "get_" name, this is a setter.)
void Grammar::get_token_strings(vector<string>& my_token_strings)
{
    token_strings = my_token_strings;
}
// Print each stored token on its own line.
void Grammar::print_token_strings()
{
    for (const auto& tok : token_strings) {
        cout << tok << endl;
    }
}
/**
 * For every symbol, records:
 *  - left_appears[s]:  indices of rules with s on the left-hand side;
 *  - right_appears[s]: indices of rules containing s on the right;
 *  - depend[s]: symbols whose FOLLOW set absorbs FOLLOW(s) - i.e. the
 *    right-hand-side symbols of s's rules whose trailing context can all
 *    derive $. Requires infer_empty to be filled first.
 */
void Grammar::init_appears_depend()
{
    for (int k = 0; k < symbols.size(); k++) {
        left_appears[symbols[k]] = vector<int>();
        right_appears[symbols[k]] = vector<int>();
        depend[symbols[k]] = vector<string>();
        for (int i = 0; i < grammar_rules.size(); i++) {
            if (grammar_rules[i].first == symbols[k]) {
                // Left-hand side matches: record the rule index.
                left_appears[symbols[k]].push_back(i);
                // Build FOLLOW dependencies from this rule.
                for (int m = 0; m < grammar_rules[i].second.size(); m++) {
                    int n;
                    // Check whether everything AFTER position m can
                    // derive the empty string $.
                    for (n = m + 1; n < grammar_rules[i].second.size(); n++) {
                        if (!infer_empty[grammar_rules[i].second[n]]) {
                            break;
                        }
                    }
                    // If so, FOLLOW(left) flows into FOLLOW(second[m]),
                    // so second[m] depends on the left symbol.
                    if (n == grammar_rules[i].second.size()) {
                        if (symbols[k] != grammar_rules[i].second[m]) {
                            depend[symbols[k]].push_back(grammar_rules[i].second[m]);
                        }
                    }
                }
            }
            for (int j = 0; j < grammar_rules[i].second.size(); j++) {
                // Right-hand side contains the symbol: record the rule
                // index (once per rule).
                if (grammar_rules[i].second[j] == symbols[k]) {
                    right_appears[symbols[k]].push_back(i);
                    break;
                }
            }
        }
    }
}
/**
 * Decides (and memoizes in infer_empty) whether @p symbol can derive
 * the empty string $.
 *
 * NOTE(review): the recursion has no visited-set, so a grammar where a
 * nonterminal can reach itself through nullable-prefix rules could
 * recurse without bound - confirm the grammars used here avoid that.
 */
bool Grammar::symbol_infer_empty(const string& symbol) {
    // Memoized result available.
    if (infer_empty.find(symbol) != infer_empty.end()) {
        return infer_empty[symbol];
    }
    // A terminal derives $ iff it IS the $ symbol.
    if (find(VTs.begin(), VTs.end(), symbol) != VTs.end()) {
        infer_empty[symbol] = (symbol == "$") ;
        return infer_empty[symbol];
    }
    // Nonterminal: check every production with this left-hand side.
    for (int i = 0; i < grammar_rules.size(); i++) {
        if (grammar_rules[i].first == symbol) {
            int j;
            vector<string> rule_right = grammar_rules[i].second;
            for (j = 0; j < rule_right.size(); j++) {
                // Recurse; stop at the first right-hand symbol that
                // cannot derive $.
                if (!(this->symbol_infer_empty(rule_right[j]))) {
                    break;
                }
            }
            // The whole right-hand side derives $ => symbol does too.
            if (j == rule_right.size()) {
                infer_empty[symbol] = true;
                return infer_empty[symbol];
            }
        }
    }
    // No production derives $: the symbol cannot.
    infer_empty[symbol] = false;
    return infer_empty[symbol];
}
/**
 * Computes (and memoizes in first) the FIRST set of @p symbol, sorted
 * and deduplicated. Requires infer_empty to be filled.
 *
 * NOTE(review): recursion on right-hand-side symbols has no cycle
 * guard, so a left-recursive grammar would recurse without bound -
 * confirm the input grammars are free of left recursion.
 */
vector<string> Grammar::symbol_infer_first(const string& symbol)
{
    // Memoized result available.
    if (first.find(symbol) != first.end()) {
        return first[symbol];
    }
    vector<string> symbol_first;
    // A terminal's FIRST set is the terminal itself.
    if (find(VTs.begin(), VTs.end(), symbol) != VTs.end()) {
        symbol_first.push_back(symbol);
        first[symbol] = symbol_first;
        return first[symbol];
    }
    // Nonterminal: derive from every production with this left side.
    for (int i = 0; i < grammar_rules.size(); i++) {
        if (grammar_rules[i].first == symbol) {
            int j;
            for (j = 0; j < grammar_rules[i].second.size(); j++) {
                // Add FIRST(Xj) of each right-hand symbol in turn.
                vector<string> firsts = symbol_infer_first(grammar_rules[i].second[j]);
                for (int k = 0; k < firsts.size(); k++) {
                    symbol_first.push_back(firsts[k]);
                }
                // Stop once a symbol cannot derive the empty string $.
                if (!infer_empty[grammar_rules[i].second[j]]) {
                    break;
                }
            }
            // Whole right-hand side nullable => $ belongs to FIRST.
            if (j == grammar_rules[i].second.size()) {
                symbol_first.push_back("$");
            }
        }
    }
    // Deduplicate and sort the accumulated set.
    set<string> ssymbol_first(symbol_first.begin(), symbol_first.end());
    symbol_first.clear();
    symbol_first.resize(ssymbol_first.size());
    symbol_first.assign(ssymbol_first.begin(), ssymbol_first.end());
    sort(symbol_first.begin(), symbol_first.end());
    // Memoize and return.
    first[symbol] = symbol_first;
    return first[symbol];
}
/**
 * Computes ONE refinement step of FOLLOW(@p symbol) from the current
 * contents of first/follow, returning the sorted, deduplicated set
 * (without $). init_grammar_set() iterates this to a fixpoint.
 *
 * NOTE(review): only the FIRST occurrence of the symbol in each rule's
 * right-hand side is considered (the scan breaks at the first match) -
 * a symbol appearing twice in one rule contributes only once; confirm
 * the grammars used here never repeat a symbol within a rule.
 */
vector<string> Grammar::symbol_infer_follow(const string& symbol)
{
    vector<string> symbol_follow;
    // Rules whose right-hand side contains the symbol.
    vector<int> right_appear = right_appears[symbol];
    for (int i = 0; i < right_appear.size(); i++) {
        int cnt;
        vector<string> rule_right = grammar_rules[right_appear[i]].second;
        // Advance cnt to the position just after the symbol.
        for (cnt = 0; cnt < rule_right.size(); cnt++) {
            if (rule_right[cnt] == symbol) {
                break;
            }
        }
        cnt++;
        // Walk the trailing context of the symbol.
        for (; cnt < rule_right.size(); cnt++) {
            // Add FIRST of each following symbol.
            vector<string> symbol_first = first[rule_right[cnt]];
            for (int j = 0; j < symbol_first.size(); j++) {
                symbol_follow.push_back(symbol_first[j]);
            }
            // Stop once a following symbol cannot derive $.
            if (!infer_empty[rule_right[cnt]]) {
                break;
            }
        }
        // Entire trailing context nullable: absorb FOLLOW of the rule's
        // left-hand side (if already computed).
        if (cnt == rule_right.size()) {
            if (follow.find(grammar_rules[right_appear[i]].first) != follow.end()) {
                vector<string> first_follow = follow[grammar_rules[right_appear[i]].first];
                for (int j = 0; j < first_follow.size(); j++) {
                    symbol_follow.push_back(first_follow[j]);
                }
            }
        }
    }
    // FOLLOW sets never contain $: erase any collected occurrences.
    auto it = remove(symbol_follow.begin(), symbol_follow.end(), "$");
    auto it1 = symbol_follow.erase(it, symbol_follow.end()); // it1 unused.
    // Deduplicate and sort the result.
    set<string> ssymbol_follow(symbol_follow.begin(), symbol_follow.end());
    symbol_follow.clear();
    symbol_follow.resize(ssymbol_follow.size());
    symbol_follow.assign(ssymbol_follow.begin(), ssymbol_follow.end());
    sort(symbol_follow.begin(), symbol_follow.end());
    return symbol_follow;
}

55
LL1/grammar.h Normal file
View File

@ -0,0 +1,55 @@
// 语法生成器
#ifndef GRAMMAR_H
#define GRAMMAR_H
#include <string>
#include <vector>
#include <map>
#include <unordered_set>
#include <unordered_map>
using namespace std;
class Grammar
{
public:
const string grammar_file = "./tests/grammar.txt";
Grammar();
~Grammar();
void read_grammar(); // 读取语法规则
void print_grammar(); // 打印语法规则
void expand_grammar(); // 拓展语法规则
void init_grammar_set(); // 初始化语法相关集合
void print_grammar_set(); // 打印语法相关集合
void get_token_strings(vector<string> &); // 获取 token_stirngs
void print_token_strings();
protected:
vector<pair<string, vector<string>>> grammar_rules; // 产生式规则
string start; // 起始字符
vector<string> symbols; // 符号
vector<string> VTs; // 终结符
vector<string> VNs; // 非终结符
unordered_map<string, vector<string>> first; // FIRST 集
unordered_map<string, vector<string>> follow; // FOLLOW 集
unordered_map<string, bool> infer_empty; // 是否可以推导出 $ 空字符
vector<string> token_strings;
private:
unordered_map<string, vector<int>> left_appears; // 该符号出现在哪些产生式左侧
unordered_map<string, vector<int>> right_appears; // 该符号出现在哪些产生式右侧
unordered_map<string, vector<string>> depend; // FOLLOW 集的依赖关系
void init_appears_depend(); // 获取 appear depend 集合
bool symbol_infer_empty(const string& symbol); // 判断符号是否可以推导出 $ 空字符
vector<string> symbol_infer_first(const string& symbol);// 推导符号的 FIRST 集
vector<string> symbol_infer_follow(const string& symbol);// 推导符号的 FOLLOW 集
};
#endif // !GRAMMAR_H

9
README.md Normal file
View File

@ -0,0 +1,9 @@
三个模块:LL1、NFA、MAIN。
MAIN 负责整合 LL1 和 NFA。
每个模块基于 CMake 实现静态链接,CMakeLists 已经写好。
```
mkdir build
cd build
cmake ..
cmake --build .
```

BIN
bin/LL.lib Normal file

Binary file not shown.

BIN
bin/nfa.lib Normal file

Binary file not shown.

14
main/CMakeLists.txt Normal file
View File

@ -0,0 +1,14 @@
cmake_minimum_required(VERSION 3.10)
project(main)
file(GLOB SOURCES "*.cpp")
add_executable(main ${SOURCES})
# Link against the prebuilt static libraries from bin/.
# NOTE(review): these are hardcoded MSVC-style .lib paths resolved
# relative to the build tree; this standalone build breaks on other
# toolchains or layouts. The top-level CMakeLists.txt links the library
# targets directly instead - confirm this file is still needed.
target_link_libraries(main PRIVATE ${CMAKE_BINARY_DIR}/../../bin/LL.lib)
target_link_libraries(main PRIVATE ${CMAKE_BINARY_DIR}/../../bin/nfa.lib)
#
# target_include_directories(main PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

32
main/LL1.h Normal file
View File

@ -0,0 +1,32 @@
// LL(1) parser built on top of the grammar facilities in Grammar.
// NOTE(review): this header is a verbatim copy of LL1/LL1.h kept for the
// standalone main/ build - the two must stay in sync.
#ifndef LL1_H
#define LL1_H
#include "grammar.h"
using namespace std; // NOTE(review): using-directive in a header leaks into every includer.
class LL1:public Grammar{
public:
    LL1();
    ~LL1();
    bool IsLL1(); // Check whether the grammar is LL(1).
    void build_LL1_predict(); // Build the LL(1) predictive parsing table.
    void print_LL1_predict(); // Print the predictive parsing table.
    void build_LL1_grammar(); // Run the table-driven parse and record each step.
    void print_LL1_grammar_log();
    void fileout_LL1_grammar_log(string file_name);
private:
    unordered_map<string, vector<string>> select; // Accumulated SELECT set per left-hand side.
    unordered_map<string, unordered_map<string, int>> LL1_predict; // Table: nonterminal -> lookahead -> rule index.
    vector<string> LL1_grammar_log; // Recorded parse steps.
    int insert_rule(pair<string, vector<string>>& new_rule); // Insert a rule, returning its index.
};
#endif // !LL1_H

55
main/grammar.h Normal file
View File

@ -0,0 +1,55 @@
// Grammar loader and FIRST/FOLLOW set builder; base class of the
// LL(1) parser.
// NOTE(review): this header is a verbatim copy of LL1/grammar.h kept for
// the standalone main/ build - the two must stay in sync.
#ifndef GRAMMAR_H
#define GRAMMAR_H
#include <string>
#include <vector>
#include <map>
#include <unordered_set>
#include <unordered_map>
using namespace std; // NOTE(review): using-directive in a header leaks into every includer.
class Grammar
{
public:
    // Grammar description path, relative to the working directory.
    const string grammar_file = "./tests/grammar.txt";
    Grammar();
    ~Grammar();
    void read_grammar(); // Read the production rules from grammar_file.
    void print_grammar(); // Print the production rules.
    void expand_grammar(); // Augment the grammar with a new start symbol S'.
    void init_grammar_set(); // Build infer_empty / FIRST / FOLLOW sets.
    void print_grammar_set(); // Print those sets.
    void get_token_strings(vector<string> &); // Store the token stream (setter despite the name).
    void print_token_strings();
protected:
    vector<pair<string, vector<string>>> grammar_rules; // Productions: left -> symbol list.
    string start; // Start symbol.
    vector<string> symbols; // All grammar symbols.
    vector<string> VTs; // Terminals.
    vector<string> VNs; // Nonterminals.
    unordered_map<string, vector<string>> first; // FIRST sets.
    unordered_map<string, vector<string>> follow; // FOLLOW sets.
    unordered_map<string, bool> infer_empty; // Can the symbol derive the empty string $?
    vector<string> token_strings;
private:
    unordered_map<string, vector<int>> left_appears; // Rules with this symbol on the left.
    unordered_map<string, vector<int>> right_appears; // Rules containing this symbol on the right.
    unordered_map<string, vector<string>> depend; // FOLLOW-set dependency edges.
    void init_appears_depend(); // Build the appears/depend tables.
    bool symbol_infer_empty(const string& symbol); // Can the symbol derive $?
    vector<string> symbol_infer_first(const string& symbol);// Compute the symbol's FIRST set.
    vector<string> symbol_infer_follow(const string& symbol);// One refinement step of the FOLLOW set.
};
#endif // !GRAMMAR_H

80
main/main.cpp Normal file
View File

@ -0,0 +1,80 @@
#include <iostream>
#include <fstream>
#include <cassert>
#include "nfa.h"
#include "grammar.h"
#include "LL1.h"
using namespace std;
/// Driver: builds the lexer DFA once, then runs lexical analysis and
/// LL(1) parsing over each bundled test input, writing token and parse
/// logs next to the inputs.
int main(int argc, char** argv) {
    // Build the NFA from the regular-expression description, convert it
    // to a DFA and minimize.
    NFA nfa = RexToNFA();
    printNFA(nfa);
    DFA dfa = nfaToDFA(nfa);
    //printDFA(dfa);
    // NOTE(review): minimizeDFA is applied twice; a single pass should
    // already be a fixed point - confirm whether the double call works
    // around a minimizer bug.
    DFA minimizedDFA = minimizeDFA(minimizeDFA(dfa));
    removeUnreachableStates(minimizedDFA);
    //printDFA(minimizedDFA);
    // Bundled test inputs and the per-input output paths.
    string inputs[6] = {
        "tests/00/00.txt",
        "tests/01/01.txt",
        "tests/02/02.txt",
        "tests/07/07.txt",
        "tests/08_err/08.txt",
        "tests/10_err/10.txt"
    };
    string outputs_lexical[6] = {
        "tests/00/00_my_lexical.txt",
        "tests/01/01_my_lexical.txt",
        "tests/02/02_my_lexical.txt",
        "tests/07/07_my_lexical.txt",
        "tests/08_err/08_my_lexical.txt",
        "tests/10_err/10_my_lexical.txt"
    };
    string outputs_grammar[6] = {
        "tests/00/00_my_grammar.txt",
        "tests/01/01_my_grammar.txt",
        "tests/02/02_my_grammar.txt",
        "tests/07/07_my_grammar.txt",
        "tests/08_err/08_my_grammar.txt",
        "tests/10_err/10_my_grammar.txt"
    };
    int i = 0;
    for (auto input : inputs) {
        // A fresh parser per input; its constructor re-reads the grammar
        // file and rebuilds the FIRST/FOLLOW sets.
        LL1 ll;
        //ll.print_grammar_set();
        string content = readfile(input);
        // Tokenize; the token log is written to outputs_lexical[i].
        vector<string> token_strings = recognize(minimizedDFA, content,outputs_lexical[i]);
        // NOTE(review): the LL(1) check result is ignored - parsing
        // proceeds even when IsLL1() reported a conflict.
        bool flag = ll.IsLL1();
        ll.build_LL1_predict();
        // ll.print_LL1_predict();
        ll.get_token_strings(token_strings);
        // ll.print_token_strings();
        ll.build_LL1_grammar();
        ll.fileout_LL1_grammar_log(outputs_grammar[i]);
        // ll.print_LL1_grammar_log();
        cout << endl;
        i++;
    }
    return 0;
}

173
main/nfa.h Normal file
View File

@ -0,0 +1,173 @@
#pragma once
#ifndef __NFA__H__
#define __NFA__H__
#include <map>
#include <set>
#include <deque>
#include <vector>
#include <iostream>
#include <fstream>
#include <string>
#include <stdio.h>
#include <sstream>
#include <stack>
#include <queue>
#include <algorithm>
using namespace std;
//单词符号的类型,返回<待测代码中的单词符号,WordType>
// Lexical categories; the lexer returns <lexeme, WordType> pairs.
typedef enum WordType {
    // A recognized identifier is first checked against the keyword
    // table; only non-keywords become IDN.
    KW_INT = 0, // int
    KW_VOID, // void
    KW_RETURN, // return
    KW_CONST, // const
    OP_ADD, // +
    OP_SUB, // -
    OP_MUL, // *
    OP_DIV, // /
    OP_MOD, // %
    OP_ASSIGN, // =
    OP_GT, // >
    OP_LT, // <
    OP_EQ, // ==
    OP_LE, // <=
    OP_GE, // >=
    OP_NE, // !=
    OP_AND, // &&
    OP_OR, // ||
    SE_LBRAC, // ( left bracket
    SE_RBRAC, // ) right bracket
    SE_LCBRAC, // { left curly bracket
    SE_RCBRAC, // } right curly bracket
    SE_COMMA, // ,
    SE_SEMI, // ;
    IDN, // [a-zA-Z][a-zA-Z_0-9]*
    INT_VAL, // -*[0-9]+
    UNKOWN // (sic) fallback for unrecognized input
}WordType;
string getWordTypeName(WordType type);
//定义输入的字符类别
// Input character classes driving the NFA/DFA transitions.
typedef enum InputCharType {
    LETTER = 0, // letter 0
    UNDERLINE, // _ 1
    DIGIT, // digit 2 - per the original note, numbers are checked before
           // returning to avoid leading-zero forms like 01 (which GCC
           // would accept as 1)
    //OP
    ADD, // + 3
    SUB, // - 4
    MUL, // * 5
    DIV, // / 6
    MOD, // % 7
    EQ, // = 8
    GT, // > 9
    LT, // < 10
    NOT, // ! 11
    AND, // & 12
    OR, // | 13
    //SE
    LBRACKET, // ( 14
    RBRACKET, // ) 15
    LCBRAC, // { 16
    RCBRAC, // } 17
    COMMA, // , 18
    SEMI, // ; 19
    EPSILON, // empty (epsilon) transition 20
}InputCharType;
string getInputChartypeName(InputCharType type);
// Coarse token categories used in the lexer's output.
enum class TokenType {
    KW = 0, // keyword
    OP, // operator
    SE, // separator
    IDN, // identifier
    INT, // integer literal
    UNKNOWN
};
TokenType getTokenType(WordType wordType,string buffer);
// A recognized token: its text plus its coarse category.
typedef struct Token {
    string value;
    TokenType type;
} Token;
//定义函数判断输入的字符类别
InputCharType getInputCharType(char c);
string getWordTypeName(WordType type,string buffer);
//定义状态类
// One automaton state (shared by the NFA and DFA representations).
class State {
public:
    int id; // State number.
    map<InputCharType, set<State*>> transitions; // Per input class, the set of target states (singleton for a DFA).
    bool isFinalState; // Accepting state?
    WordType wordType; // Token type to emit when accepting in this state.
    State(int id) : id(id), isFinalState(false), wordType(UNKOWN) {}
    // Add an edge for the given input class.
    void addTransition(InputCharType input, State* targetState) {
        transitions[input].insert(targetState);
    }
    // Mark (or unmark) this state as accepting with the given token type.
    void setFinalState(bool isFinal, WordType type) {
        isFinalState = isFinal;
        wordType = type;
    }
    // Order states by id.
    bool operator<(const State& other) const {
        return id < other.id;
    }
};
//为了是set内部有序定义排序结构体StatePtrCompare
// Orders State pointers by state id so sets of State* iterate
// deterministically, independent of allocation addresses.
struct StatePtrCompare {
    bool operator()(const State* lhs, const State* rhs) const {
        return lhs->id < rhs->id;
    }
};
//定义NFA类
// Nondeterministic finite automaton over InputCharType classes.
class NFA {
public:
    State* startState; // Start state.
    set<State*, StatePtrCompare> endStates; // Accepting states.
    set<State*, StatePtrCompare> states; // All states.
    NFA(State* startState, set<State*, StatePtrCompare> endStates, set<State*, StatePtrCompare> states) :
        startState(startState), endStates(endStates), states(states) {}
    // void printNFA();
};
NFA RexToNFA();
void printNFA(const NFA& nfa);
NFA buildNFA(string filename);
NFA RexToNFA();
set<State*, StatePtrCompare> move(const set<State*, StatePtrCompare>& states, InputCharType input);
set<State*, StatePtrCompare> epsilonClosure(const set<State*, StatePtrCompare>& states);
// Deterministic finite automaton; reuses the NFA State structure, with
// at most one target per input class.
class DFA {
public:
    State* startState; // Start state.
    set<State*, StatePtrCompare> endStates; // Accepting states.
    set<State*, StatePtrCompare> states; // All states.
    DFA(State* startState, set<State*, StatePtrCompare> endStates, set<State*, StatePtrCompare> states) :
        startState(startState), endStates(endStates), states(states) {}
};
void removeUnreachableStates(DFA& dfa);
void printDFA(const DFA& dfa);
DFA nfaToDFA(const NFA& nfa);
void printDFA(const DFA& dfa);
// Strict weak ordering for sets of states, used as a map-key comparator:
// shorter sets first, then element-wise by state id.
struct SetComparator {
    bool operator()(const set<State*, StatePtrCompare>& a, const set<State*, StatePtrCompare>& b) const {
        if (a.size() != b.size()) {
            return a.size() < b.size();
        }
        // BUG FIX: the previous implementation copied both sets into
        // vectors, sorted them by id, but then compared with vecA < vecB,
        // which compares the State POINTERS - an ordering that varies from
        // run to run. Compare by state id instead (which the sort clearly
        // intended). Both sets are already id-ordered by StatePtrCompare,
        // so no copying or re-sorting is needed.
        return lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(),
            [](const State* lhs, const State* rhs) { return lhs->id < rhs->id; });
    }
};
string getGrammarName(WordType type, string buffer);
DFA minimizeDFA(const DFA& dfa);
vector<string> recognize(const DFA& dfa, const string& input, const string& output);
string readfile(const string& filename);
#endif

14
nfa/CMakeLists.txt Normal file
View File

@ -0,0 +1,14 @@
cmake_minimum_required(VERSION 3.10)
project(nfa)
# Collect this module's sources.
file(GLOB SOURCES "*.cpp")
# Output directory for build artifacts.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
# Build the NFA/DFA lexer as a static library.
add_library(nfa STATIC ${SOURCES})
# Expose this directory's headers to consumers of the library.
target_include_directories(nfa PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
# NOTE(review): the top-level CMakeLists.txt globs nfa/*.cpp directly and
# never add_subdirectory()s this file - presumably it exists for building
# the module standalone. Confirm before relying on it.

251
nfa/dfa.cpp Normal file
View File

@ -0,0 +1,251 @@
#include "nfa.h"
// One block (group of equivalent states) of the partition refined by
// DFA minimization.
class Partition {
public:
    set<State*, StatePtrCompare> states;
    Partition(set<State*, StatePtrCompare> states) : states(states) {}
};
/*
 * Partition-refinement idea (Hopcroft/Moore style):
 * Start from S = {N, A} - the non-accepting states N and the accepting
 * states A - and keep splitting the groups of S until S stops growing.
 * "c can split s" (s being one group of S) means:
 * 1. feed character c to every state in s and collect the target states;
 * 2. compare those targets against the current groups of S;
 * 3. if the targets fall into different groups, s is split: the states
 *    whose c-transition stays within one group (or that cannot consume c)
 *    remain together, the rest move into a new group.
 */
// split 函数用于将给定的状态集合group根据转移函数进一步细分。
// group: 要细分的状态集合
// input: 当前考虑的输入字符类型
// partitions: 存储所有分区的集合,如果需要细分,将在该集合中添加新分区
void split(const set<State*, StatePtrCompare>& group, InputCharType input, set<Partition*>& partitions) {
// 用于存储每个目标分区与对应新分组状态集合的映射
map<Partition*, set<State*, StatePtrCompare>> targetPartitionsMap;
for (State* state : group) {
auto it = state->transitions.find(input);
if (it != state->transitions.end()) {
State* targetState = *(it->second.begin());//DFA状态转移具有唯一性
// 在当前所有分区中查找包含目标状态的分区
for (Partition* partition : partitions) {
if (partition->states.find(targetState) != partition->states.end()) {
// 在映射表中将当前状态添加到对应的目标分区
targetPartitionsMap[partition].insert(state);
break;
}
}
}
}
// 经过上述操作将在group里的状态根据到达目标Partiset<State*, StatePtrCompare>分到不同set<State*>
// 遍历目标分区映射表检查是否需要进一步细分即将经过input输入状态转换后处于不同目标分区的集合内部拆分开
for (auto& entry : targetPartitionsMap) {
Partition* targetPartition = entry.first;
//到达该targetPartition的group部分状态合集如下
set<State*, StatePtrCompare>& newGroupStates = entry.second;
//等于的情况不拆分不会出现大于的情况将targetPartition拆分开来也可以将到达不同割集的源状态分割开来也可以分割目标状态总之是状态转移结果在现存割集即可
if (newGroupStates.size() < targetPartition->states.size()) {
for (State* state : newGroupStates) {
targetPartition->states.erase(state);
}
Partition* newGroup = new Partition(newGroupStates);
partitions.insert(newGroup);
}
}
}
// Minimizes `dfa` by partition refinement and returns a new DFA whose states
// are the final partitions, renumbered from 0. The input DFA's states are
// left untouched (only the temporary Partition objects are freed here).
DFA minimizeDFA(const DFA& dfa) {
    set<Partition*> partitions;
    // Initial partition: all non-accepting states form one group, and the
    // accepting states are grouped by their WordType.
    /*
     * Grouping accepting states by WordType up front both preserves the token
     * type attached to each accepting state and speeds up refinement.
     */
    map<WordType, set<State*, StatePtrCompare>> endStateGroups; // initial accepting-state groups
    set<State*, StatePtrCompare> nonEndStates; // initial non-accepting group
    for (State* state : dfa.states) {
        if (state->isFinalState) {
            endStateGroups[state->wordType].insert(state);// refine accepting states by wordType
        }
        else {
            nonEndStates.insert(state);
        }
    }
    // Build the initial partitions: {N, A} with A further split per WordType
    // (see the rationale above).
    for (auto& entry : endStateGroups) {
        Partition* endStateGroup = new Partition(entry.second);
        partitions.insert(endStateGroup);
    }
    Partition* nonEndStateGroup = new Partition(nonEndStates);
    partitions.insert(nonEndStateGroup);
    // Repeatedly refine the partitions until a fixed point is reached.
    size_t oldSize;// partition count before one refinement pass
    do {
        oldSize = partitions.size();
        for (InputCharType input = static_cast<InputCharType>(0); input < EPSILON; input = static_cast<InputCharType>(input + 1)) {// analogous to computing Ia, Ib, ... per input class
            for (Partition* partition : set<Partition*>(partitions)) {// iterate over a snapshot, since split() may insert new partitions
                if (partition->states.size() > 1) {// singleton groups cannot be split further
                    split(partition->states, input, partitions);// core refinement step
                }
            }
        }
    } while (partitions.size() != oldSize);// stop once the number of partitions stabilizes
    // Build the minimized DFA: map each partition to one renumbered state.
    // DFA constructor shape: DFA(State* start, set<State*, StatePtrCompare> ends, set<State*, StatePtrCompare> states)
    set<State*, StatePtrCompare> minimizedStates;
    set<State*, StatePtrCompare> minimizedEndStates;
    State* minimizedStartState = nullptr;
    map<State*, State*> stateMap;
    for (Partition* partition : partitions) {// one new state per partition
        State* newState = new State(minimizedStates.size());// renumber sequentially
        // If this partition contains the old start state, the new state
        // becomes the minimized DFA's start state.
        if (partition->states.find(dfa.startState) != partition->states.end()) {
            minimizedStartState = newState;
        }
        // Pick a representative state from a non-empty partition.
        if (!partition->states.empty()) {
            State* representative = *(partition->states.begin());// accepting states were split per WordType above, so any member represents the group
            // Refinement keeps accepting and non-accepting states apart, so
            // checking one member is enough to classify the whole partition.
            // Carry over the accepting flag and its WordType.
            if (representative->isFinalState) {
                newState->setFinalState(true, representative->wordType);
                minimizedEndStates.insert(newState);
            }
        }
        // Map every old state of the partition to the same new state.
        for (State* state : partition->states)
        {
            stateMap[state] = newState;
        }
        // Register the new state in the minimized DFA.
        minimizedStates.insert(newState);
    }
    // Re-create the transitions on the new states.
    for (State* oldState : dfa.states) {
        // New state corresponding to this old state.
        State* newState = stateMap[oldState];
        for (const auto& transition : oldState->transitions) {
            InputCharType input = transition.first;
            State* oldTargetState = *(transition.second.begin());// the DFA reuses the NFA structure, so each transition set holds exactly one state
            State* newTargetState = stateMap[oldTargetState];// old target state ...
            newState->addTransition(input, newTargetState);// ... mapped to its new state
        }
    }
    // Release the temporary partition objects.
    for (Partition* partition : partitions) {
        delete partition;
    }
    return DFA(minimizedStartState, minimizedEndStates, minimizedStates);
}
// Deletes (and frees) every state that cannot be reached from the DFA's
// start state. Only the reachable *set* matters, so the traversal order is
// irrelevant.
void removeUnreachableStates(DFA& dfa) {
    set<State*> visited;        // states proven reachable so far
    vector<State*> worklist;    // traversal frontier
    visited.insert(dfa.startState);
    worklist.push_back(dfa.startState);
    // Graph traversal over the transition edges.
    while (!worklist.empty()) {
        State* current = worklist.back();
        worklist.pop_back();
        for (const auto& transition : current->transitions) {
            // A DFA transition set holds exactly one target state (the NFA
            // structure is reused, but sizes are <= 1 here).
            State* target = *(transition.second.begin());
            // insert().second is true only the first time we see a state.
            if (visited.insert(target).second) {
                worklist.push_back(target);
            }
        }
    }
    // Drop and free every state the traversal never touched.
    for (auto it = dfa.states.begin(); it != dfa.states.end();) {
        if (visited.find(*it) == visited.end()) {
            State* dead = *it;
            it = dfa.states.erase(it);
            delete dead;
        }
        else {
            ++it;
        }
    }
}
vector<string> recognize(const DFA& dfa, const string& input, const string& output) {
State* currentState = dfa.startState;
State* nextState = nullptr;
string buffer;
vector<string> tokens; // 用于收集识别到的Token
//打开结果输出文件
ofstream file(output);
if (!file.is_open()) {
cout << "Error opening file!" << endl;
return tokens;
}
for (size_t i = 0; i < input.length(); ++i) {
char c = input[i];
if (c == ' '||c=='\n'||c=='\r\n'||c==' ')// 如果是空格、换行等分隔符,则跳过
{continue; }
InputCharType inputCharType = getInputCharType(c);
auto it = currentState->transitions.find(inputCharType);
if (it != currentState->transitions.end()) {
nextState = *(it->second.begin());
buffer.push_back(c);
if (nextState->isFinalState && i + 1 < input.length()) {// 如果下一个状态是终止状态并且还有剩余字符
char nextChar = input[i + 1];
InputCharType nextInputCharType = getInputCharType(nextChar);
auto nextIt = nextState->transitions.find(nextInputCharType);// 查找下一个状态的转换表中是否有对应的输入字符类型
if (nextIt == nextState->transitions.end()) {// 如果没有更多匹配的转换
// 输出识别到的单词符号和对应的类型
cout << buffer << "\t<" << getWordTypeName(nextState->wordType,buffer) << ">" << endl;
file << buffer << "\t<" << getWordTypeName(nextState->wordType, buffer) << ">" << endl;
tokens.push_back(getGrammarName(nextState->wordType, buffer));
buffer.clear();
currentState = dfa.startState;
}
else {
currentState = nextState;// 更新当前状态为下一个状态
}
}
else {
currentState = nextState;// 更新当前状态为下一个状态
}
}
else {// 如果没有找到匹配的转换
if (currentState->isFinalState) {// 如果当前状态是终止状态
// 输出识别到的单词符号和对应的类型
cout << buffer << "\t<" << getWordTypeName(currentState->wordType,buffer) << ">" << endl;
file << buffer << "\t<" << getWordTypeName(currentState->wordType, buffer) << ">" << endl;
tokens.push_back(getGrammarName(currentState->wordType, buffer) );
buffer.clear();
}
else {
// 如果当前状态不是终止状态
// 输出无法识别的字符信息
cout << "Unrecognized characters: " << c << endl;
file << "Unrecognized characters: " << c << endl;
buffer.clear();
}
currentState = dfa.startState;// 回到起始状态
//--i;// 重新处理当前字符,还是跳过吧,这里可以添加错误处理
}
}
// 处理最后一个字符,如果缓冲区不为空且当前状态是终止状态,对应第一个if里面的else
if (!buffer.empty() && currentState->isFinalState) {
cout << buffer << "\t<" << getWordTypeName(currentState->wordType,buffer) << ">" << endl;
file << buffer << "\t<" << getWordTypeName(currentState->wordType, buffer) << ">" << endl;
tokens.push_back(getGrammarName(currentState->wordType, buffer));
}
file.close();//关闭文件
return tokens;
}

262
nfa/nfa.cpp Normal file
View File

@ -0,0 +1,262 @@
// 将正则表达式转换为非确定性有限自动机
#include "nfa.h"
// Builds the lexer NFA from a hand-written regular-expression table and tags
// each accepting state with the WordType it recognizes.
NFA RexToNFA() {
    // The alternatives are separated by spaces because a literal '|' would
    // clash with the "||" operator. 'l' stands for any letter, '_' for the
    // underscore and '0' for any digit (any digit character would do, since
    // getInputCharType() maps them all to DIGIT). "[lu]" would mean l|u.
    string rex = "+ - * / % = > < == <= >= != && || ( ) { } , ; [l_][l_0]* -?00*";
    // Accepting WordType for each alternative above, in the same order.
    vector<WordType> finalState = {
        OP_ADD, OP_SUB,OP_MUL,OP_DIV,OP_MOD,OP_ASSIGN,OP_GT,OP_LT, OP_EQ,OP_LE,OP_GE,OP_NE, OP_AND, OP_OR,SE_LBRAC, SE_RBRAC,
        SE_LCBRAC,SE_RCBRAC,SE_COMMA,SE_SEMI,IDN,INT_VAL
    };
    stringstream ss(rex);
    string target;
    // Create the shared start state.
    int stateIndex = 0;
    int finalIndex = 0;// NOTE(review): unused; endStates.size() is used instead
    State* startState = new State(stateIndex++);
    set<State*, StatePtrCompare> endStates;
    set<State*, StatePtrCompare> allStates = { startState };
    while (getline(ss, target,' ')) {
        // e.g. target == "[l_][l_0]*"
        State* currentState = startState;
        for (size_t i = 0; i < target.length();i++) {
            // Allocate the state reached from currentState on the next input.
            State* newState = new State(stateIndex++);
            allStates.insert(newState);
            // One symbol of lookahead is needed below.
            if (target[i] == '[') {
                // [...] is a character class forming a single input position;
                // the symbol after ']' ('?' or '*') decides how it is wired.
                for (i=i+1; i < target.length() && target[i] != ']'; i++) {
                    InputCharType input = getInputCharType(target[i]);
                    if (input != EPSILON) {
                        // Every class member transitions to the same new state.
                        currentState->addTransition(input, newState);
                    }
                }
            }
            else {
                InputCharType input = getInputCharType(target[i]);
                currentState->addTransition(input, newState);
            }
            // Look one character ahead for a quantifier.
            if (i + 1 < target.length() && target[i + 1] == '?') {
                // Optional ('?'): add an EPSILON state reachable both from
                // before and after the symbol.
                State* epsState = new State(stateIndex++);
                allStates.insert(epsState);
                currentState->addTransition(EPSILON, epsState);
                newState->addTransition(EPSILON, epsState);
                currentState = epsState;
                // Skip the '?' character.
                i++;
            }
            else if (i + 1 < target.length() && target[i + 1] == '*') {
                // Kleene star ('*'): like '?' but with a back edge so the
                // symbol may repeat.
                State* epsState = new State(stateIndex++);
                allStates.insert(epsState);
                currentState->addTransition(EPSILON, epsState);
                newState->addTransition(EPSILON, epsState);
                epsState->addTransition(EPSILON, currentState);
                currentState = epsState;
                // Skip the '*' character.
                i++;
            }
            else {
                currentState = newState;
            }
            // Check whether this is the end of the alternative.
            if (i == (target.length() - 1)) {
                // Last character reached: mark the current state as accepting
                // with the WordType of this alternative (endStates.size()
                // counts the alternatives finished so far).
                currentState->setFinalState(true, finalState[endStates.size()]);
                endStates.insert(currentState);
            }
        }//for
    }
    // Return the assembled NFA.
    return NFA(startState, endStates, allStates);
}
// Reads an NFA from a transition-table file.
// Expected layout:
//   <stateCount> <inputCount>
//   stateCount rows with one cell per input class: "#" for no transition,
//     or a comma-separated list of target state ids
//   <finalCount>, then finalCount pairs "<stateId> <wordTypeId>"
// State 0 is the start state. Exits the process if the file cannot be opened.
NFA buildNFA(string filename) {
    ifstream in(filename);
    if (!in) {
        cerr << "Cannot open file: " << filename << endl;
        exit(EXIT_FAILURE);
    }
    int stateCount, inputCount;
    in >> stateCount >> inputCount;
    vector<State*> nodes(stateCount);
    for (int id = 0; id < stateCount; id++) {
        nodes[id] = new State(id);
    }
    State* start = nodes[0];
    set<State*, StatePtrCompare> accepting;
    // One cell per (state, input-class) pair.
    for (int from = 0; from < stateCount; from++) {
        for (int sym = 0; sym < inputCount; sym++) {
            string cell;
            in >> cell;
            if (cell == "#") {
                continue;   // '#' marks "no transition on this input"
            }
            stringstream cellStream(cell);
            string idText;
            while (getline(cellStream, idText, ',')) {
                nodes[from]->addTransition(static_cast<InputCharType>(sym), nodes[stoi(idText)]);
            }
        }
    }
    // Accepting states and their token types.
    int finalCount;
    in >> finalCount;
    for (int k = 0; k < finalCount; k++) {
        int stateId, wordTypeId;
        in >> stateId >> wordTypeId;
        nodes[stateId]->setFinalState(true, static_cast<WordType>(wordTypeId));
        accepting.insert(nodes[stateId]);
    }
    return NFA(start, accepting, set<State*, StatePtrCompare>(nodes.begin(), nodes.end()));
}
void printNFA(const NFA& nfa) {
cout << "Start state: " << nfa.startState->id << endl;
cout << "End states: "<<endl;
for (auto state : nfa.endStates) {
cout << state->id << " " << getWordTypeName(state->wordType) << " " << (state->isFinalState == true) << endl;
}
cout << endl;
cout << "States and transitions:" << endl;
for (auto state : nfa.states) {
cout << "State " << state->id << ":" << endl;
for (auto transition : state->transitions) {
cout << "\tInput " << getInputChartypeName(transition.first) << ": ";
for (auto targetState : transition.second) {
cout << targetState->id << " ";
}
cout << endl;
}
}
}
// Computes move(states, input): the union of the `input`-successors of every
// state in `states`. No epsilon closure is applied here (the caller combines
// this with epsilonClosure()).
set<State*, StatePtrCompare> move(const set<State*, StatePtrCompare>& states, InputCharType input) {
    set<State*, StatePtrCompare> targetStates;
    for (State* state : states) {
        auto it = state->transitions.find(input);
        if (it != state->transitions.end()) {
            // set::insert already ignores duplicates, so the previous
            // element-by-element membership test was redundant; merge the
            // whole target set in one range insert.
            targetStates.insert(it->second.begin(), it->second.end());
        }
    }
    return targetStates;
}
// Returns the epsilon-closure of `states`: every state reachable from the
// input set through EPSILON transitions alone, including the input states
// themselves.
set<State*, StatePtrCompare> epsilonClosure(const set<State*, StatePtrCompare>& states) {
    set<State*, StatePtrCompare> closure(states);
    stack<State*> pending;
    for (State* s : states) {
        pending.push(s);
    }
    while (!pending.empty()) {
        State* current = pending.top();
        pending.pop();
        auto eps = current->transitions.find(EPSILON);
        if (eps == current->transitions.end()) {
            continue;   // no epsilon edges out of this state
        }
        for (State* next : eps->second) {
            // insert().second is true only for states not seen before, which
            // keeps any state from entering the stack twice.
            if (closure.insert(next).second) {
                pending.push(next);
            }
        }
    }
    return closure;
}
// Subset construction: converts the NFA into an equivalent DFA whose states
// correspond to epsilon-closed sets of NFA states.
DFA nfaToDFA(const NFA& nfa) {
    map<set<State*, StatePtrCompare>, State*, SetComparator> dfaStatesMap; // maps each NFA state set to its DFA state
    queue<set<State*, StatePtrCompare>> nfaStatesQueue; // BFS worklist of NFA state sets
    set<State*, StatePtrCompare> dfaStates;
    set<State*, StatePtrCompare> dfaEndStates;
    set<State*, StatePtrCompare> nfaStartClosure = epsilonClosure({ nfa.startState });
    State* dfaStartState = new State(0);
    dfaStatesMap[nfaStartClosure] = dfaStartState;
    dfaStates.insert(dfaStartState);
    nfaStatesQueue.push(nfaStartClosure);
    int nextStateId = 1;
    //set<State*, StatePtrCompare> nfaStartClosure
    while (!nfaStatesQueue.empty()) {
        set<State*, StatePtrCompare> currentNFAStates = nfaStatesQueue.front();
        nfaStatesQueue.pop();
        State* currentDFAState = dfaStatesMap[currentNFAStates];
        // If the set contains an accepting NFA state, the DFA state accepts
        // too. NOTE(review): the first accepting member found (lowest id,
        // since the set is id-ordered) decides the WordType; overlapping
        // token classes rely on that ordering — confirm it matches intent.
        for (State* nfaState : currentNFAStates) {
            if (nfaState->isFinalState) {
                // cout << nfaState->id << "is FinalState" << endl;
                currentDFAState->setFinalState(true, nfaState->wordType);
                dfaEndStates.insert(currentDFAState);
                break;
            }
        }
        // Try every real input character class (EPSILON itself is excluded).
        for (int i = 0; i < static_cast<int>(EPSILON); i++) {
            InputCharType inputCharType = static_cast<InputCharType>(i);
            set<State*, StatePtrCompare> nextNFAStates = epsilonClosure(move(currentNFAStates, inputCharType));
            if (nextNFAStates.empty()) {
                continue;
            }
            // First time this NFA state set appears: allocate its DFA state
            // and queue it for processing.
            if (dfaStatesMap.find(nextNFAStates) == dfaStatesMap.end()) {
                State* newDFAState = new State(nextStateId++);
                dfaStatesMap[nextNFAStates] = newDFAState;
                dfaStates.insert(newDFAState);
                nfaStatesQueue.push(nextNFAStates);
            }
            currentDFAState->addTransition(inputCharType, dfaStatesMap[nextNFAStates]);
        }
    }
    return DFA(dfaStartState, dfaEndStates, dfaStates);
}
void printDFA(const DFA& dfa) {
cout << "Start state: " << dfa.startState->id << endl;
cout << "End states: "<<endl;
for (auto state : dfa.endStates) {
cout << state->id << " " << getWordTypeName(state->wordType) << endl;
}
cout << endl;
cout << "States and transitions:" << endl;
for (auto state : dfa.states) {
cout << "State " << state->id << ":" << endl;
for (auto transition : state->transitions) {
cout << "\tInput " << getInputChartypeName(transition.first) << ": ";
for (auto targetState : transition.second) {
cout << targetState->id << " ";
}
cout << endl;
}
}
}

173
nfa/nfa.h Normal file
View File

@ -0,0 +1,173 @@
#pragma once
#ifndef __NFA__H__
#define __NFA__H__
#include <algorithm>
#include <deque>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <set>
#include <sstream>
#include <stack>
#include <stdio.h>
#include <string>
#include <utility>
#include <vector>
using namespace std;
// Token type of a recognized word; the lexer reports <lexeme, WordType>.
typedef enum WordType {
    // A lexeme recognized as an identifier is first checked against the
    // reserved words before being reported as IDN.
    KW_INT = 0, // int
    KW_VOID, // void
    KW_RETURN, // return
    KW_CONST, // const
    OP_ADD, // +
    OP_SUB, // -
    OP_MUL, // *
    OP_DIV, // /
    OP_MOD, // %
    OP_ASSIGN, // =
    OP_GT, // >
    OP_LT, // <
    OP_EQ, // ==
    OP_LE, // <=
    OP_GE, // >=
    OP_NE, // !=
    OP_AND, // &&
    OP_OR, // ||
    SE_LBRAC, // ( left bracket
    SE_RBRAC, // ) right bracket
    SE_LCBRAC, // { left curly bracket
    SE_RCBRAC, // } right curly bracket
    SE_COMMA, // ,
    SE_SEMI, // ;
    IDN, // [a-zA-Z][a-zA-Z_0-9]*
    INT_VAL, // -*[0-9]+
    UNKOWN // NOTE(review): spelling kept as-is; renaming would break existing uses
}WordType;
string getWordTypeName(WordType type);
// Character classes used as inputs to the automata.
typedef enum InputCharType {
    LETTER = 0, // letter 0
    UNDERLINE, // _ 1
    DIGIT, // digit 2 -- NOTE(review): the original note says a pre-return check avoids numbers like "01" (GCC accepts 01 == 1); confirm where that check lives
    //OP
    ADD, // + 3
    SUB, // - 4
    MUL, // * 5
    DIV, // / 6
    MOD, // % 7
    EQ, // = 8
    GT, // > 9
    LT, // < 10
    NOT, // ! 11
    AND, // & 12
    OR, // | 13
    //SE
    LBRACKET, // ( 14
    RBRACKET, // ) 15
    LCBRAC, // { 16
    RCBRAC, // } 17
    COMMA, // , 18
    SEMI, // ; 19
    EPSILON, // empty input 20; getInputCharType() also returns this for unclassified characters
}InputCharType;
string getInputChartypeName(InputCharType type);
// Coarse token categories used in the lexer's printed output.
enum class TokenType {
    KW = 0, // keyword
    OP,     // operator
    SE,     // separator
    IDN,    // identifier
    INT,    // integer literal
    UNKNOWN
};
TokenType getTokenType(WordType wordType,string buffer);
// A lexical token: the matched lexeme plus its coarse category.
// (The C-style `typedef struct` was redundant in C++ and has been dropped;
// the type is still named Token.)
struct Token {
    string value;   // the matched lexeme text
    TokenType type; // coarse category (KW/OP/SE/IDN/INT/UNKNOWN)
};
// Classifies a raw character into its InputCharType bucket.
InputCharType getInputCharType(char c);
// Returns the coarse category name ("KW"/"OP"/"SE"/"IDN"/"INT"); `buffer`
// is the lexeme, used to tell keywords apart from identifiers.
string getWordTypeName(WordType type,string buffer);
// One automaton state, shared by the NFA and DFA representations.
class State {
public:
    int id; // state number
    map<InputCharType, set<State*>> transitions; // per input class, the set of target states (size 1 when used as a DFA)
    bool isFinalState; // whether this is an accepting state
    WordType wordType; // token type reported when this state accepts
    State(int id) : id(id), isFinalState(false), wordType(UNKOWN) {}
    // Adds a transition on `input` to `targetState`.
    void addTransition(InputCharType input, State* targetState) {
        transitions[input].insert(targetState);
    }
    // Marks the state as accepting (or not) and records its token type.
    void setFinalState(bool isFinal, WordType type) {
        isFinalState = isFinal;
        wordType = type;
    }
    // Orders states by id (pointer sets use StatePtrCompare instead).
    bool operator<(const State& other) const {
        return id < other.id;
    }
};
// Orders State pointers by id so sets of State* iterate deterministically
// (instead of by pointer address).
struct StatePtrCompare {
    bool operator()(const State* lhs, const State* rhs) const {
        return lhs->id < rhs->id;
    }
};
// A non-deterministic finite automaton.
class NFA {
public:
    State* startState; // start state
    set<State*, StatePtrCompare> endStates; // accepting states
    set<State*, StatePtrCompare> states; // all states
    // Sink parameters: the sets are taken by value and moved into the
    // members, so callers passing temporaries avoid an extra copy of each
    // (potentially large) state set.
    NFA(State* startState, set<State*, StatePtrCompare> endStates, set<State*, StatePtrCompare> states) :
        startState(startState), endStates(std::move(endStates)), states(std::move(states)) {}
};
// Builds the lexer NFA from the built-in regular-expression table.
// (A duplicate declaration of RexToNFA was removed.)
NFA RexToNFA();
void printNFA(const NFA& nfa);
// Builds an NFA from a transition-table file.
NFA buildNFA(string filename);
// move() and epsilonClosure() are the two primitives of subset construction.
set<State*, StatePtrCompare> move(const set<State*, StatePtrCompare>& states, InputCharType input);
set<State*, StatePtrCompare> epsilonClosure(const set<State*, StatePtrCompare>& states);
// A deterministic finite automaton (each transition set holds one target).
class DFA {
public:
    State* startState; // start state
    set<State*, StatePtrCompare> endStates; // accepting states
    set<State*, StatePtrCompare> states; // all states
    // Sink parameters: taken by value and moved into the members, so callers
    // passing temporaries avoid an extra copy of each state set.
    DFA(State* startState, set<State*, StatePtrCompare> endStates, set<State*, StatePtrCompare> states) :
        startState(startState), endStates(std::move(endStates)), states(std::move(states)) {}
};
// Prunes states unreachable from the DFA's start state.
// (A duplicate declaration of printDFA was removed.)
void removeUnreachableStates(DFA& dfa);
void printDFA(const DFA& dfa);
// Subset construction: NFA -> equivalent DFA.
DFA nfaToDFA(const NFA& nfa);
// Strict weak ordering over sets of states, used as the key comparator of
// the subset-construction map in nfaToDFA(). Smaller sets order first;
// equal-sized sets are compared element-wise by state id.
// The previous implementation copied both sets into vectors, re-sorted them
// (pointless: the sets are already id-ordered by StatePtrCompare) and then
// compared the vectors with `<`, i.e. by raw pointer value — making the map
// ordering depend on allocation addresses. Comparing by id is deterministic
// and allocation-free; state ids are unique within an automaton, so key
// equivalence (same size, same ids) identifies exactly the same sets.
struct SetComparator {
    bool operator()(const set<State*, StatePtrCompare>& a, const set<State*, StatePtrCompare>& b) const {
        if (a.size() != b.size()) {
            return a.size() < b.size();
        }
        return lexicographical_compare(
            a.begin(), a.end(), b.begin(), b.end(),
            [](const State* x, const State* y) { return x->id < y->id; });
    }
};
// Maps a token (WordType + lexeme) to the grammar-symbol name used by the parser.
string getGrammarName(WordType type, string buffer);
DFA minimizeDFA(const DFA& dfa);
// Runs the DFA over `input`, writing the token stream to stdout and `output`.
vector<string> recognize(const DFA& dfa, const string& input, const string& output);
string readfile(const string& filename);
#endif

287
nfa/tool.cpp Normal file
View File

@ -0,0 +1,287 @@
#include "nfa.h"
// Classifies a raw character into its InputCharType bucket. Any character
// that is not a recognized operator, separator, letter or digit maps to
// EPSILON, which the automata treat as "not a real input".
InputCharType getInputCharType(char c) {
    // Letters and digits are ranges, so handle them before the switch.
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
        return LETTER;
    }
    if (c >= '0' && c <= '9') {
        return DIGIT;
    }
    switch (c) {
    case '_': return UNDERLINE;
    case '+': return ADD;
    case '-': return SUB;
    case '*': return MUL;
    case '/': return DIV;
    case '%': return MOD;
    case '=': return EQ;
    case '>': return GT;
    case '<': return LT;
    case '!': return NOT;
    case '&': return AND;
    case '|': return OR;
    case '(': return LBRACKET;
    case ')': return RBRACKET;
    case '{': return LCBRAC;
    case '}': return RCBRAC;
    case ',': return COMMA;
    case ';': return SEMI;
    default:  return EPSILON; // unclassified character
    }
}
// Returns a printable name for an input character class (used by the
// printNFA/printDFA debug dumps).
string getInputChartypeName(InputCharType type) {
    switch (type)
    {
    case LETTER:
        return "LETTER";
    case UNDERLINE:
        return "UNDERLINE";
    case DIGIT:
        return "DIGIT";
    case ADD:
        return "+";
    case SUB:
        return "-";
    case MUL:
        return "*";
    case DIV:
        return "/";
    case MOD:
        return "%";
    case EQ:
        return "=";
    case GT:
        return ">";
    case LT:
        return "<";
    case NOT:
        return "!";
    case AND:
        return "&";
    case OR:
        return "|";
    case LBRACKET:
        return "(";
    case RBRACKET:
        return ")";
    case LCBRAC:
        return "{";
    case RCBRAC:
        return "}";
    case COMMA:
        return ",";
    case SEMI:
        return ";";
    case EPSILON:
        return "EPSILON";
    default:
        // BUGFIX: previously returned the misspelled "UNKOWN", inconsistent
        // with getWordTypeName(), which returns "UNKNOWN".
        return "UNKNOWN";
    }
}
// Maps a WordType to its coarse category name: "OP", "SE", "KW", "IDN",
// "INT" or "UNKNOWN". `buffer` is the matched lexeme; it is needed because
// reserved words are first recognized as identifiers (IDN) and must be
// reclassified as keywords here.
string getWordTypeName(WordType type, string buffer) {
    switch (type) {
    case OP_ADD:
    case OP_SUB:
    case OP_MUL:
    case OP_DIV:
    case OP_MOD:
    case OP_ASSIGN:
    case OP_GT:
    case OP_LT:
    case OP_EQ:
    case OP_LE:
    case OP_GE:
    case OP_NE:
    case OP_AND:
    case OP_OR:
        return "OP";
    case SE_LBRAC:
    case SE_RBRAC:
    case SE_LCBRAC:
    case SE_RCBRAC:
    case SE_COMMA:
    case SE_SEMI:
        return "SE";
    case IDN:
        // Idiomatic string comparison (was `!buffer.compare(...)`).
        if (buffer == "int" || buffer == "void" || buffer == "const" || buffer == "return") {
            return "KW";
        }
        else {
            return "IDN";
        }
    case INT_VAL:
        return "INT";
    default:
        return "UNKNOWN";
    }
}
// Reads the whole file into a string and returns it; an unopenable file
// yields an empty string (no error is reported — callers decide).
string readfile(const string& filename)
{
    ifstream file(filename);
    // Stream the entire file buffer into the string in one shot.
    ostringstream contents;
    contents << file.rdbuf();
    // Newlines are intentionally preserved; an earlier (commented-out)
    // version stripped '\n' characters here.
    return contents.str();
}
// Maps a WordType (plus lexeme) to its coarse TokenType category. `buffer`
// is needed because reserved words are first matched as identifiers and are
// reclassified as keywords here (mirrors getWordTypeName()).
TokenType getTokenType(WordType type,string buffer) {
    switch (type) {
    case OP_ADD:
    case OP_SUB:
    case OP_MUL:
    case OP_DIV:
    case OP_MOD:
    case OP_ASSIGN:
    case OP_GT:
    case OP_LT:
    case OP_EQ:
    case OP_LE:
    case OP_GE:
    case OP_NE:
    case OP_AND:
    case OP_OR:
        return TokenType::OP;
    case SE_LBRAC:
    case SE_RBRAC:
    case SE_LCBRAC:
    case SE_RCBRAC:
    case SE_COMMA:
    case SE_SEMI:
        return TokenType::SE;
    case IDN:
        // Idiomatic string comparison (was `!buffer.compare(...)`).
        if (buffer == "int" || buffer == "void" || buffer == "const" || buffer == "return") {
            return TokenType::KW;
        }
        else {
            return TokenType::IDN;
        }
    case INT_VAL:
        return TokenType::INT;
    default:
        return TokenType::UNKNOWN;
    }
}
// Returns the enumerator name of a WordType (debug/printing helper, used by
// printNFA/printDFA). Unlike the two-argument overload, this does not
// collapse types into coarse categories.
string getWordTypeName(WordType type) {
    switch (type) {
    case KW_INT:
        return "KW_INT";
    case KW_VOID:
        return "KW_VOID";
    case KW_RETURN:
        return "KW_RETURN";
    case KW_CONST:
        return "KW_CONST";
    case OP_ADD:
        return "OP_ADD";
    case OP_SUB:
        return "OP_SUB";
    case OP_MUL:
        return "OP_MUL";
    case OP_DIV:
        return "OP_DIV";
    case OP_MOD:
        return "OP_MOD";
    case OP_ASSIGN:
        return "OP_ASSIGN";
    case OP_GT:
        return "OP_GT";
    case OP_LT:
        return "OP_LT";
    case OP_EQ:
        return "OP_EQ";
    case OP_LE:
        return "OP_LE";
    case OP_GE:
        return "OP_GE";
    case OP_NE:
        return "OP_NE";
    case OP_AND:
        return "OP_AND";
    case OP_OR:
        return "OP_OR";
    case SE_LBRAC:
        return "SE_LBRAC";
    case SE_RBRAC:
        return "SE_RBRAC";
    case SE_LCBRAC:
        return "SE_LCBRAC";
    case SE_RCBRAC:
        return "SE_RCBRAC";
    case SE_COMMA:
        return "SE_COMMA";
    case SE_SEMI:
        return "SE_SEMI";
    case IDN:
        return "IDN";
    case INT_VAL:
        return "INT_VAL";
    default:
        return "UNKNOWN";
    }
}
// Maps a token to the terminal-symbol name used by the grammar: operators
// and separators map to their literal spelling, identifiers map to "IDN"
// unless the lexeme is a reserved word, and integer literals map to "INT".
// Any WordType without a grammar symbol is treated as an internal error and
// terminates the process.
string getGrammarName(WordType type, string buffer) {
    switch (type) {
    case OP_ADD: return "+";
    case OP_SUB: return "-";
    case OP_MUL: return "*";
    case OP_DIV: return "/";
    case OP_MOD: return "%";
    case OP_ASSIGN: return "=";
    case OP_GT: return ">";
    case OP_LT: return "<";
    case OP_EQ: return "==";
    case OP_LE: return "<=";
    case OP_GE: return ">=";
    case OP_NE: return "!=";
    case OP_AND: return "&&";
    case OP_OR: return "||";
    case SE_LBRAC: return "(";
    case SE_RBRAC: return ")";
    case SE_LCBRAC: return "{";
    case SE_RCBRAC: return "}";
    case SE_COMMA: return ",";
    case SE_SEMI: return ";";
    case IDN:
        // Reserved words were matched as identifiers; idiomatic string
        // comparison (was `!buffer.compare(...)`).
        if (buffer == "int") {
            return "int";
        }
        else if (buffer == "void") {
            return "void";
        }
        else if (buffer == "return") {
            return "return";
        }
        else if (buffer == "const") {
            return "const";
        }
        else {
            return "IDN";
        }
    case INT_VAL: return "INT";
    default:
        cerr << "Token Error: " << type << endl;
        exit(-1);
    }
}