commit d6861884f28c1ff326e233a1bbc14b777c4fd493 Author: LiuYuanchi Date: Sun May 5 21:51:08 2024 +0800 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..13a0149 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*/build/ +build/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..8f4d0b4 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 3.10) +project(compiler-bin) + +# 收集所有的cpp源文件 +file(GLOB SOURCES_LL "LL1/*.cpp") +file(GLOB SOURCES_NFA "nfa/*.cpp") +file(GLOB SOURCES_MAIN "main/*.cpp") + +# 设置输出目录为 bin +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +# 创建静态链接库 +add_library(LL STATIC ${SOURCES_LL}) +add_library(nfa STATIC ${SOURCES_NFA}) + +# 添加头文件目录 +target_include_directories(LL PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/LL) +target_include_directories(nfa PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/nfa) + +# 添加可执行文件 +add_executable(main ${SOURCES_MAIN}) + +# 链接静态库 +target_link_libraries(main PRIVATE LL nfa) diff --git a/LL1/CMakeLists.txt b/LL1/CMakeLists.txt new file mode 100644 index 0000000..22471ff --- /dev/null +++ b/LL1/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.10) +project(LL) + +# 收集所有的cpp源文件 +file(GLOB SOURCES "*.cpp") + +# 设置输出目录为 bin +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +# 创建静态链接库 +add_library(LL STATIC ${SOURCES}) + +# 添加头文件目录 +target_include_directories(LL PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) \ No newline at end of file diff --git a/LL1/LL1.cpp b/LL1/LL1.cpp new file mode 100644 index 0000000..f057f29 --- /dev/null +++ b/LL1/LL1.cpp @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "LL1.h" + + +LL1::LL1() +{ + read_grammar(); + init_grammar_set(); +} + +LL1::~LL1() +{ +} + +bool LL1::IsLL1() +{ + string symbol; + vector right_first = vector(); + vector left_follow; + for (int i = 0; i < grammar_rules.size(); i++) { + symbol.clear(); + right_first.clear(); + 
left_follow.clear(); + + symbol = grammar_rules[i].first; + + + + // 计算 产生式左侧 FOLLOW 集 + left_follow = follow[symbol]; + + // 计算 产生式右侧 FIRST 集 + + // 对 X1 的 非 $ 符号 加入 + for (int j = 0; j < first[grammar_rules[i].second[0]].size(); j++) { + if (first[grammar_rules[i].second[0]][j] == "$") { + continue; + } + right_first.push_back(first[grammar_rules[i].second[0]][j]); + } + + int cnt; + for (cnt = 1; cnt < grammar_rules[i].second.size(); cnt++) { + + // 当且仅当 有 $ 符号时 继续加入 + if (!infer_empty[grammar_rules[i].second[cnt - 1]]) { + break; + } + for (int j = 0; j < first[grammar_rules[i].second[cnt]].size(); j++) { + if (first[grammar_rules[i].second[cnt]][j] == "$") { + continue; + } + right_first.push_back(first[grammar_rules[i].second[cnt]][j]); + } + } + + // 若都能推导至 $ 符号时 加入 + if (cnt == grammar_rules[i].second.size() && infer_empty[grammar_rules[i].second[0]]) { + right_first.push_back("$"); + } + + // 对产生式右侧 FIRST 集 进行 去重 + set sright_first(right_first.begin(), right_first.end()); + right_first.clear(); + right_first.resize(sright_first.size()); + right_first.assign(sright_first.begin(), sright_first.end()); + + + + vector symbol_select; + + // 若产生式右侧 FIRST 集为 {$} 时 + if (right_first.size() == 1 && right_first[0] == "$") { + + // SELECT 集为 产生式右侧 FOLLOW 集 与 {$} 的交集 + symbol_select = left_follow; + if (find(left_follow.begin(), left_follow.end(), "$") == left_follow.end()) { + symbol_select.push_back("$"); + } + } + else + { + // SELECT 集为 产生式左侧 FIRST 集 + symbol_select = right_first; + } + + // 对 SELECT 集 进行排序 方便接下来进行集合运算 + sort(symbol_select.begin(), symbol_select.end()); + + vector new_select = vector(); + + // 判断 SELECT 表中有无现有数据 + if (select.find(symbol) == select.end()) { + + select[symbol] = symbol_select; + } + else { + + // 判断两个相同产生式左侧 SELECT 集 是否相交 + set_intersection(symbol_select.begin(), symbol_select.end(), select[symbol].begin(), select[symbol].end(), back_inserter(new_select)); + + if (new_select.size() == 0) { + // 不相交,继续运算,存入两者并集 + 
set_union(symbol_select.begin(), symbol_select.end(), select[symbol].begin(), select[symbol].end(), back_inserter(new_select)); + } + else + { + // 非 LL(1) 文法,退出 + cout << "This grammar is not LL (1) grammar" << endl; + return false; + } + + } + + } + + // cout << "该文法为 LL(1) 文法!" << endl; + return true; +} + +void LL1::build_LL1_predict() +{ + // 对每一个 非终结符 进行初始化行 + for (int i = 0; i < VNs.size(); i++) { + if (LL1_predict.find(VNs[i]) == LL1_predict.end()) { + LL1_predict[VNs[i]] = unordered_map(); + } + } + + string symbol; + vector right_first = vector(); + vector left_follow; + + // 遍历 产生式 构建 预测分析表 + for (int i = 0; i < grammar_rules.size(); i++) { + symbol.clear(); + right_first.clear(); + left_follow.clear(); + + symbol = grammar_rules[i].first; + + + // 计算 产生式左侧 FOLLOW 集 + left_follow = follow[symbol]; + + unordered_map &symbol_predict = LL1_predict[symbol]; + + + // 计算 产生式右侧 FIRST 集 + + // 对 X1 的 非 $ 符号 加入 + for (int j = 0; j < first[grammar_rules[i].second[0]].size(); j++) { + if (first[grammar_rules[i].second[0]][j] == "$") { + continue; + } + right_first.push_back(first[grammar_rules[i].second[0]][j]); + } + + int cnt; + for (cnt = 1; cnt < grammar_rules[i].second.size(); cnt++) { + + // 当且仅当 有 $ 符号时 继续加入 + if (!infer_empty[grammar_rules[i].second[cnt - 1]]) { + break; + } + for (int j = 0; j < first[grammar_rules[i].second[cnt]].size(); j++) { + if (first[grammar_rules[i].second[cnt]][j] == "$") { + continue; + } + right_first.push_back(first[grammar_rules[i].second[cnt]][j]); + } + } + + // 若都能推导至 $ 符号时 加入 + if (cnt == grammar_rules[i].second.size() && infer_empty[grammar_rules[i].second[0]]) { + right_first.push_back("$"); + } + + // 对产生式右侧 FIRST 集 进行 去重 + set sright_first(right_first.begin(), right_first.end()); + right_first.clear(); + right_first.resize(sright_first.size()); + right_first.assign(sright_first.begin(), sright_first.end()); + + // 循环遍历 FIRST 集进行初始化 + for (int j = 0; j < right_first.size(); j++) { + if (right_first[j] == "$") { + pair> 
new_rule (grammar_rules[i].first, vector()); + new_rule.second.push_back("$"); + int rule_id = insert_rule(new_rule); + + for (int k = 0; k < left_follow.size(); k++) { + symbol_predict[left_follow[k]] = rule_id; + } + } + symbol_predict[right_first[j]] = i; + + } + + } + + +} + +void LL1::print_LL1_predict() +{ + cout << "[LL1_predict]:" << endl; + for (auto iter = LL1_predict.begin(); iter != LL1_predict.end(); ++iter) { + cout << (*iter).first << " "; + for (auto j = (*iter).second.begin(); j != (*iter).second.end(); ++j) { + cout << (*j).first << "," << (*j).second << " "; + } + cout << endl; + + } + cout << endl << endl; + +} + +void LL1::build_LL1_grammar() +{ + // 符号栈 + stack stack; + int token_cnt = 0; + + // 起始符 入栈 + stack.push(start); + + while (!stack.empty()) + { + LL1_grammar_log.push_back(string()); + + // 栈顶符号 + // 判断栈顶是否为 空符号 + if (stack.top() == "$") { + // 栈空 以 EOF 表示 + LL1_grammar_log.back() += "EOF"; + } + else + { + LL1_grammar_log.back() += stack.top(); + } + + // 添加 # 分割 + LL1_grammar_log.back() += "#"; + + // 面临输入的符号 + string this_token; + if (token_cnt == token_strings.size()) { + // 栈空 以 EOF 表示 + this_token = "$"; + LL1_grammar_log.back() += "EOF"; + } + else + { + this_token = token_strings[token_cnt]; + LL1_grammar_log.back() += token_strings[token_cnt]; + } + + // 对栈顶元素与即将输入的符号进行比较 + if (stack.top() == this_token) { + // 栈顶出栈 token 指向下一位 + token_cnt++; + stack.pop(); + + if (this_token == "$") { + // 分析成功 结束分析 + LL1_grammar_log.back() += "\taccept"; + } + else + { + // 跳过 + LL1_grammar_log.back() += "\tmove"; + } + } + // 若为终结符 + else if (find(VTs.begin(), VTs.end(), stack.top()) != VTs.end()) { + if (stack.top() == "$") { + stack.pop(); + LL1_grammar_log.pop_back(); + } + else { + LL1_grammar_log.back() += "\terror"; + return; + } + } + else + { + auto tab = LL1_predict[stack.top()]; + + if (tab.find(this_token) == tab.end()) { + LL1_grammar_log.back() += "\terror"; + return; + } + else + { + auto this_rule = 
grammar_rules[tab[this_token]]; + stack.pop(); + for (int i = this_rule.second.size() - 1; i >= 0; i--) { + stack.push(this_rule.second[i]); + } + LL1_grammar_log.back() += "\treduction"; + } + } + } + +} + +void LL1::print_LL1_grammar_log() +{ + for (int i = 0; i < LL1_grammar_log.size(); ++i) { + cout << LL1_grammar_log[i] << endl; + } +} + +void LL1::fileout_LL1_grammar_log(string file_name) +{ + //打开结果输出文件 + fstream outfile(file_name); + + if (!outfile.is_open()) { + cout << "[FILEOUT] fail to open file" << endl; + } + + for (int i = 0; i < LL1_grammar_log.size(); ++i) { + outfile << LL1_grammar_log[i] << endl; + } + outfile.close(); +} + +int LL1::insert_rule(pair>& new_rule) +{ + int cnt; + for (cnt = 0; cnt < grammar_rules.size(); cnt++) { + // 当 产生式规则 中存在这条产生式时 返回序号 + if (grammar_rules[cnt].first == new_rule.first && grammar_rules[cnt].second == new_rule.second) { + return cnt; + } + } + // 若不存在 返回序号的同时加入 + grammar_rules.push_back(new_rule); + return cnt; +} + diff --git a/LL1/LL1.h b/LL1/LL1.h new file mode 100644 index 0000000..5eaba92 --- /dev/null +++ b/LL1/LL1.h @@ -0,0 +1,32 @@ +// LL1 语法分析器 +#ifndef LL1_H +#define LL1_H + +#include "grammar.h" + +using namespace std; + +class LL1:public Grammar{ +public: + LL1(); + ~LL1(); + + bool IsLL1(); // 判断该文法是否为 LL1 文法 + void build_LL1_predict(); // 构建 LL1 的预测分析表 + void print_LL1_predict(); // 打印 LL1 的预测分析表 + void build_LL1_grammar(); // 构建规约序列 + void print_LL1_grammar_log(); + void fileout_LL1_grammar_log(string file_name); + + +private: + unordered_map> select; // 计算符号的 SELECT 集合 + unordered_map> LL1_predict; // LL1 的预测分析表 + vector LL1_grammar_log; // 规约序列 + + int insert_rule(pair>& new_rule); // 增加新的规则 +}; + + + +#endif // !LL1_H diff --git a/LL1/grammar.cpp b/LL1/grammar.cpp new file mode 100644 index 0000000..2aefe3e --- /dev/null +++ b/LL1/grammar.cpp @@ -0,0 +1,520 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "grammar.h" + + +Grammar::Grammar() +{ + +} + 
+Grammar::~Grammar() +{ + +} + + + +void Grammar::read_grammar() { + fstream infile; + infile.open(grammar_file,ios::in); + if (!infile.is_open()) + { + cout << "[READ_GRAMMAR] fail to open file: "<< grammar_file << endl; + return; + } + + string buf; + string arrow = "->"; + string farrow; + bool start_flag = true; + string left; + string forms; + + while (!infile.eof()) { + // 清理 string + buf.clear(); + left.clear(); + forms.clear(); + farrow.clear(); + + + grammar_rules.push_back(pair>()); + getline(infile, buf); + + stringstream ss(buf); + + // 读取产生式左侧 + ss >> left; + grammar_rules.back().first = left; + symbols.push_back(left); + VNs.push_back(left); + + // 存储 start + if (start_flag) { + start = left; + start_flag = false; + } + + // 读取 -> 符号 并保证合法 + ss >> farrow; + if (farrow != arrow) { + cout << "Grammar reading error" << endl; + } + + // 读取产生式右侧 + while (ss >> forms) + { + grammar_rules.back().second.push_back(forms); + symbols.push_back(forms); + forms.clear(); + } + } + + // 符号集 和 非终结符 去重 + set ssymbols(symbols.begin(), symbols.end()); + symbols.clear(); + symbols.resize(ssymbols.size()); + symbols.assign(ssymbols.begin(), ssymbols.end()); + + set sVNs(VNs.begin(), VNs.end()); + VNs.clear(); + VNs.resize(sVNs.size()); + VNs.assign(sVNs.begin(), sVNs.end()); + + // 符号集 和 非终结符 排序 以保证差集的成功 + sort(symbols.begin(), symbols.end()); + sort(VNs.begin(), VNs.end()); + + // 取差集 得到终极符 + set_difference(symbols.begin(), symbols.end(), VNs.begin(), VNs.end(), back_inserter(VTs)); + + infile.close(); + +} + +void Grammar::print_grammar() +{ + cout << "[start]: " << endl << start << endl << endl; + + cout << "[VTs]:" << endl; + for (int i = 0; i < VTs.size(); i++) { + cout << VTs[i] << " "; + if (((i + 1) % 5) == 0) + cout << endl; + } + cout << endl << endl; + + cout << "[VNs]:" << endl; + for (int i = 0; i < VNs.size(); i++) { + cout << VNs[i] << " "; + if (((i + 1) % 5) == 0) + cout << endl; + } + cout << endl << endl; + + cout << "[symbols]:" << endl; + for (int i = 
0; i < symbols.size(); i++) { + cout << symbols[i] << " "; + if (((i + 1) % 5) == 0) + cout << endl; + } + cout << endl << endl; + + cout << "[grammar_rules]: " << grammar_rules.size() << endl; + for (int i = 0; i < grammar_rules.size(); ++i) { + cout << grammar_rules[i].first << " -> "; + for (int j = 0; j < grammar_rules[i].second.size(); ++j) { + cout << "\"" << grammar_rules[i].second[j] << "\" "; + } + cout << endl; + } + cout << endl << endl; +} + +void Grammar::expand_grammar() +{ + string new_start = start + "\'"; + pair> new_rule = pair>(new_start, vector()); + new_rule.second.push_back(start); + + VNs.push_back(new_start); + symbols.push_back(new_start); + grammar_rules.insert(grammar_rules.begin(), new_rule); + start = new_start; + + // 符号集排序 + sort(symbols.begin(), symbols.end()); + +} + +void Grammar::init_grammar_set() +{ + string symbol; + + + + // 对符号集中各符号进行推导 是否可以到达 $ 空符号 + for (int i = 0; i < symbols.size(); i++) { + symbol = symbols[i]; + this->symbol_infer_empty(symbol); + symbol.clear(); + } + + // 初始化符号在产生式的 出现 依赖 情况 + init_appears_depend(); + + // 对符号集中各符号进行推导 FIRST 集 + for (int i = 0; i < symbols.size(); i++) { + symbol = symbols[i]; + this->symbol_infer_first(symbol); + symbol.clear(); + } + + // 对符号集中各符号进行推导 FOLLOW 集 + + // 符号队列 + deque queue; + + // 初次遍历所有符号 生成初始的 FOLLOW 集 + + // 构建 start 的 FOLLOW 集 + follow[start] = this->symbol_infer_follow(start); + follow[start].push_back("$"); + queue.push_back(start); + + // 构建除 start 的 FOLLOW 集 + for (int i = 0; i < symbols.size(); i++) { + symbol = symbols[i]; + if (symbol == start) { + symbol.clear(); + continue; + } + follow[symbol] = this->symbol_infer_follow(symbol); + queue.push_back(symbol); + symbol.clear(); + } + + // 对 符号队列 进行进一步生成 + while (!queue.empty()) { + // 读取 符号队列 开头 + symbol = queue.front(); + queue.pop_front(); + + // 若 FOLLOW 集发生改变 + vector new_symbol_follow = this->symbol_infer_follow(symbol); + if (follow[symbol].size() < new_symbol_follow.size()) { + // 对依赖 该符号 的所有符号添加至 符号队列 
+ vector dep = depend[symbol]; + for (int i = 0; i < dep.size(); i++) { + queue.push_back(dep[i]); + } + follow[symbol] = new_symbol_follow; + } + symbol.clear(); + } + + +} + +void Grammar::print_grammar_set() +{ + // 打印符号在产生式的出现情况 + cout << "[left_appears]:" << endl; + for (int i = 0; i < symbols.size(); i++) { + cout << "LEFT( " << symbols[i] << " ) = {"; + for (int j = 0; j < left_appears[symbols[i]].size(); j++) { + cout << " " << left_appears[symbols[i]][j] << " "; + } + cout << "}" << endl; + } + cout << endl << endl; + + cout << "[right_appears]:" << endl; + for (int i = 0; i < symbols.size(); i++) { + cout << "RIGHT( " << symbols[i] << " ) = {"; + for (int j = 0; j < right_appears[symbols[i]].size(); j++) { + cout << " " << right_appears[symbols[i]][j] << " "; + } + cout << "}" << endl; + } + cout << endl << endl; + + // 打印 FOLLOW 集的依赖关系 + cout << "[depend]:" << endl; + for (int i = 0; i < symbols.size(); i++) { + cout << "DEPEND( " << symbols[i] << " ) = {"; + for (int j = 0; j < depend[symbols[i]].size(); j++) { + cout << " " << depend[symbols[i]][j] << " "; + } + cout << "}" << endl; + } + cout << endl << endl; + + + // 打印是否可以推导出 $ 空符号 + cout << "[infer_empty]:" << endl; + for (int i = 0; i < symbols.size(); i++) { + cout << symbols[i]<<" -> " << infer_empty[symbols[i]] << endl; + } + cout << endl << endl; + + // 打印 FIRST 集 + cout << "[FIRST]:" << endl; + for (int i = 0; i < symbols.size(); i++) { + cout << "FIRST( " << symbols[i] << " ) = {"; + for (int j = 0; j < first[symbols[i]].size(); j++) { + cout << " " << first[symbols[i]][j] << " "; + } + cout << "}" << endl; + } + cout << endl << endl; + + // 打印 FOLLOW 集 + cout << "[FOLLOW]:" << endl; + for (int i = 0; i < symbols.size(); i++) { + cout << "FOLLOW( " << symbols[i] << " ) = {"; + for (int j = 0; j < follow[symbols[i]].size(); j++) { + cout << " " << follow[symbols[i]][j] << " "; + } + cout << "}" << endl; + } + cout << endl << endl; + +} + +void Grammar::get_token_strings(vector& 
my_token_strings) +{ + token_strings.resize(my_token_strings.size()); + token_strings.assign(my_token_strings.begin(), my_token_strings.end()); + +} + +void Grammar::print_token_strings() +{ + for (int i = 0; i < token_strings.size(); i++) { + cout << token_strings[i] << endl; + } +} + +void Grammar::init_appears_depend() +{ + for (int k = 0; k < symbols.size(); k++) { + left_appears[symbols[k]] = vector(); + right_appears[symbols[k]] = vector(); + depend[symbols[k]] = vector(); + for (int i = 0; i < grammar_rules.size(); i++) { + if (grammar_rules[i].first == symbols[k]) { + // 产生式左侧相等 存入 left + left_appears[symbols[k]].push_back(i); + + // 对该产生式构建依赖关系 + for (int m = 0; m < grammar_rules[i].second.size(); m++) { + int n; + + // 判断该产生式右侧符号是否可以推导至 $ 空符号 + for (n = m + 1; n < grammar_rules[i].second.size(); n++) { + if (!infer_empty[grammar_rules[i].second[n]]) { + break; + } + } + // 若可以推导 按照入栈的方式依次加入 + if (n == grammar_rules[i].second.size()) { + if (symbols[k] != grammar_rules[i].second[m]) { + depend[symbols[k]].push_back(grammar_rules[i].second[m]); + } + } + + } + } + for (int j = 0; j < grammar_rules[i].second.size(); j++) { + // 产生式右侧相等 存入 left + if (grammar_rules[i].second[j] == symbols[k]) { + right_appears[symbols[k]].push_back(i); + break; + } + } + } + } + +} + +bool Grammar::symbol_infer_empty(const string& symbol) { + + // 已经进行推导过 + if (infer_empty.find(symbol) != infer_empty.end()) { + return infer_empty[symbol]; + } + + // 当符号为终结符时,当且仅当为 $ 可以推导出 $ + if (find(VTs.begin(), VTs.end(), symbol) != VTs.end()) { + infer_empty[symbol] = (symbol == "$") ; + return infer_empty[symbol]; + } + + // 当符号为非终结符时,通过产生式进行推导 + for (int i = 0; i < grammar_rules.size(); i++) { + // 当该符号为产生式左侧时 + if (grammar_rules[i].first == symbol) { + int j; + vector rule_right = grammar_rules[i].second; + for (j = 0; j < rule_right.size(); j++) { + // 递归推导 产生式右侧无法推导至 $ 时 + if (!(this->symbol_infer_empty(rule_right[j]))) { + break; + } + } + + // 当且仅当产生式右侧可以推导至 $ 时 + if (j == 
rule_right.size()) { + infer_empty[symbol] = true; + return infer_empty[symbol]; + } + } + } + + // 当各产生式都无法推导至 $ 时,则无法推导 + infer_empty[symbol] = false; + return infer_empty[symbol]; + +} + +vector Grammar::symbol_infer_first(const string& symbol) +{ + // 已经推导过 FIRST 集 + if (first.find(symbol) != first.end()) { + return first[symbol]; + } + + vector symbol_first; + + // 当符号为终结符时 FIRST 集为它本身 + if (find(VTs.begin(), VTs.end(), symbol) != VTs.end()) { + symbol_first.push_back(symbol); + first[symbol] = symbol_first; + return first[symbol]; + } + + // 当符号为非终结符时,通过产生式进行推导 + for (int i = 0; i < grammar_rules.size(); i++) { + // 当该符号为产生式左侧时 + if (grammar_rules[i].first == symbol) { + int j; + for (j = 0; j < grammar_rules[i].second.size(); j++) { + + // 依次添加所有产生式右侧的 + vector firsts = symbol_infer_first(grammar_rules[i].second[j]); + for (int k = 0; k < firsts.size(); k++) { + symbol_first.push_back(firsts[k]); + } + + // 若产生式右侧无法推导至 $ 空字符时 中断 + if (!infer_empty[grammar_rules[i].second[j]]) { + break; + } + + } + + // 当且仅当产生式右侧可以推导至 $ 时 将 $ 加入到 FIRST 集中 + if (j == grammar_rules[i].second.size()) { + symbol_first.push_back("$"); + } + } + } + + // 对当前 FIRST 集进行 去重 与 排序 + set ssymbol_first(symbol_first.begin(), symbol_first.end()); + symbol_first.clear(); + symbol_first.resize(ssymbol_first.size()); + symbol_first.assign(ssymbol_first.begin(), ssymbol_first.end()); + + sort(symbol_first.begin(), symbol_first.end()); + + // 返回非终结符的 FIRST 集 + first[symbol] = symbol_first; + return first[symbol]; +} + +vector Grammar::symbol_infer_follow(const string& symbol) +{ + vector symbol_follow; + + // 获取该符号出现在哪些产生式右侧 + vector right_appear = right_appears[symbol]; + for (int i = 0; i < right_appear.size(); i++) { + int cnt; + + // 获取该产生式右侧的符号 + vector rule_right = grammar_rules[right_appear[i]].second; + + // 依次遍历 该产生式右侧 至 该符号 后一位 + for (cnt = 0; cnt < rule_right.size(); cnt++) { + if (rule_right[cnt] == symbol) { + break; + } + } + cnt++; + + // 遍历 剩余产生式右侧 + for (; cnt < 
rule_right.size(); cnt++) { + + // 依次获取 后置元素 的 FIRST 集 + vector symbol_first = first[rule_right[cnt]]; + + // 将 该 FIRST 集 循环添加至 symbol_follow 中 + for (int j = 0; j < symbol_first.size(); j++) { + symbol_follow.push_back(symbol_first[j]); + } + + // 若不可达 $ 中断遍历 + if (!infer_empty[rule_right[cnt]]) { + break; + } + } + + // 当剩余产生式右侧均可到达 $ 时 + if (cnt == rule_right.size()) { + if (follow.find(grammar_rules[right_appear[i]].first) != follow.end()) { + + // 将产生式左侧的 FOLLOW 集 加入到 当前符号的 FOLLOW 集中 + vector first_follow = follow[grammar_rules[right_appear[i]].first]; + for (int j = 0; j < first_follow.size(); j++) { + symbol_follow.push_back(first_follow[j]); + } + + } + + } + + } + + // 删除不需要的 $ 空字符 + auto it = remove(symbol_follow.begin(), symbol_follow.end(), "$"); + auto it1 = symbol_follow.erase(it, symbol_follow.end()); + + + // 对当前 FOLLOW 集 进行去重排序 + set ssymbol_follow(symbol_follow.begin(), symbol_follow.end()); + symbol_follow.clear(); + symbol_follow.resize(ssymbol_follow.size()); + symbol_follow.assign(ssymbol_follow.begin(), ssymbol_follow.end()); + + sort(symbol_follow.begin(), symbol_follow.end()); + + + + return symbol_follow; +} + + + + + + + diff --git a/LL1/grammar.h b/LL1/grammar.h new file mode 100644 index 0000000..95e91b1 --- /dev/null +++ b/LL1/grammar.h @@ -0,0 +1,55 @@ +// 语法生成器 +#ifndef GRAMMAR_H +#define GRAMMAR_H + + +#include +#include +#include +#include +#include + +using namespace std; + +class Grammar +{ +public: + const string grammar_file = "./tests/grammar.txt"; + + Grammar(); + ~Grammar(); + void read_grammar(); // 读取语法规则 + void print_grammar(); // 打印语法规则 + void expand_grammar(); // 拓展语法规则 + void init_grammar_set(); // 初始化语法相关集合 + void print_grammar_set(); // 打印语法相关集合 + void get_token_strings(vector &); // 获取 token_stirngs + void print_token_strings(); + +protected: + vector>> grammar_rules; // 产生式规则 + string start; // 起始字符 + vector symbols; // 符号 + vector VTs; // 终结符 + vector VNs; // 非终结符 + unordered_map> first; // FIRST 集 + 
unordered_map> follow; // FOLLOW 集 + unordered_map infer_empty; // 是否可以推导出 $ 空字符 + vector token_strings; + + +private: + unordered_map> left_appears; // 该符号出现在哪些产生式左侧 + unordered_map> right_appears; // 该符号出现在哪些产生式右侧 + unordered_map> depend; // FOLLOW 集的依赖关系 + + + void init_appears_depend(); // 获取 appear depend 集合 + bool symbol_infer_empty(const string& symbol); // 判断符号是否可以推导出 $ 空字符 + vector symbol_infer_first(const string& symbol);// 推导符号的 FIRST 集 + vector symbol_infer_follow(const string& symbol);// 推导符号的 FOLLOW 集 + +}; + + +#endif // !GRAMMAR_H \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..cbea495 --- /dev/null +++ b/README.md @@ -0,0 +1,9 @@ +三个模块 LL1 NFA MAIN +MAIN负责整合LL1和NFA + +每个模块基于 cmake 实现静态连接,cmakelist已经写好 +``` +mkdir build +cd build + +``` \ No newline at end of file diff --git a/bin/LL.lib b/bin/LL.lib new file mode 100644 index 0000000..b5d67a3 Binary files /dev/null and b/bin/LL.lib differ diff --git a/bin/nfa.lib b/bin/nfa.lib new file mode 100644 index 0000000..4e55d26 Binary files /dev/null and b/bin/nfa.lib differ diff --git a/main/CMakeLists.txt b/main/CMakeLists.txt new file mode 100644 index 0000000..3b9a75c --- /dev/null +++ b/main/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.10) +project(main) + +file(GLOB SOURCES "*.cpp") + +add_executable(main ${SOURCES}) + +# 链接静态库 +target_link_libraries(main PRIVATE ${CMAKE_BINARY_DIR}/../../bin/LL.lib) +target_link_libraries(main PRIVATE ${CMAKE_BINARY_DIR}/../../bin/nfa.lib) + + +# 添加头文件目录 +# target_include_directories(main PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/main/LL1.h b/main/LL1.h new file mode 100644 index 0000000..5eaba92 --- /dev/null +++ b/main/LL1.h @@ -0,0 +1,32 @@ +// LL1 语法分析器 +#ifndef LL1_H +#define LL1_H + +#include "grammar.h" + +using namespace std; + +class LL1:public Grammar{ +public: + LL1(); + ~LL1(); + + bool IsLL1(); // 判断该文法是否为 LL1 文法 + void build_LL1_predict(); // 构建 LL1 的预测分析表 + void 
print_LL1_predict(); // 打印 LL1 的预测分析表 + void build_LL1_grammar(); // 构建规约序列 + void print_LL1_grammar_log(); + void fileout_LL1_grammar_log(string file_name); + + +private: + unordered_map> select; // 计算符号的 SELECT 集合 + unordered_map> LL1_predict; // LL1 的预测分析表 + vector LL1_grammar_log; // 规约序列 + + int insert_rule(pair>& new_rule); // 增加新的规则 +}; + + + +#endif // !LL1_H diff --git a/main/grammar.h b/main/grammar.h new file mode 100644 index 0000000..95e91b1 --- /dev/null +++ b/main/grammar.h @@ -0,0 +1,55 @@ +// 语法生成器 +#ifndef GRAMMAR_H +#define GRAMMAR_H + + +#include +#include +#include +#include +#include + +using namespace std; + +class Grammar +{ +public: + const string grammar_file = "./tests/grammar.txt"; + + Grammar(); + ~Grammar(); + void read_grammar(); // 读取语法规则 + void print_grammar(); // 打印语法规则 + void expand_grammar(); // 拓展语法规则 + void init_grammar_set(); // 初始化语法相关集合 + void print_grammar_set(); // 打印语法相关集合 + void get_token_strings(vector &); // 获取 token_stirngs + void print_token_strings(); + +protected: + vector>> grammar_rules; // 产生式规则 + string start; // 起始字符 + vector symbols; // 符号 + vector VTs; // 终结符 + vector VNs; // 非终结符 + unordered_map> first; // FIRST 集 + unordered_map> follow; // FOLLOW 集 + unordered_map infer_empty; // 是否可以推导出 $ 空字符 + vector token_strings; + + +private: + unordered_map> left_appears; // 该符号出现在哪些产生式左侧 + unordered_map> right_appears; // 该符号出现在哪些产生式右侧 + unordered_map> depend; // FOLLOW 集的依赖关系 + + + void init_appears_depend(); // 获取 appear depend 集合 + bool symbol_infer_empty(const string& symbol); // 判断符号是否可以推导出 $ 空字符 + vector symbol_infer_first(const string& symbol);// 推导符号的 FIRST 集 + vector symbol_infer_follow(const string& symbol);// 推导符号的 FOLLOW 集 + +}; + + +#endif // !GRAMMAR_H \ No newline at end of file diff --git a/main/main.cpp b/main/main.cpp new file mode 100644 index 0000000..c3ed26d --- /dev/null +++ b/main/main.cpp @@ -0,0 +1,80 @@ +#include +#include +#include + +#include "nfa.h" +#include "grammar.h" +#include 
"LL1.h" +using namespace std; + +int main(int argc, char** argv) { + + NFA nfa = RexToNFA(); + printNFA(nfa); + + DFA dfa = nfaToDFA(nfa); + //printDFA(dfa); + DFA minimizedDFA = minimizeDFA(minimizeDFA(dfa)); + removeUnreachableStates(minimizedDFA); + //printDFA(minimizedDFA); + + string inputs[6] = { + "tests/00/00.txt", + "tests/01/01.txt", + "tests/02/02.txt", + "tests/07/07.txt", + "tests/08_err/08.txt", + "tests/10_err/10.txt" + }; + + string outputs_lexical[6] = { + "tests/00/00_my_lexical.txt", + "tests/01/01_my_lexical.txt", + "tests/02/02_my_lexical.txt", + "tests/07/07_my_lexical.txt", + "tests/08_err/08_my_lexical.txt", + "tests/10_err/10_my_lexical.txt" + }; + + string outputs_grammar[6] = { + "tests/00/00_my_grammar.txt", + "tests/01/01_my_grammar.txt", + "tests/02/02_my_grammar.txt", + "tests/07/07_my_grammar.txt", + "tests/08_err/08_my_grammar.txt", + "tests/10_err/10_my_grammar.txt" + }; + + + int i = 0; + for (auto input : inputs) { + LL1 ll; + //ll.print_grammar_set(); + + string content = readfile(input); + vector token_strings = recognize(minimizedDFA, content,outputs_lexical[i]); + + bool flag = ll.IsLL1(); + ll.build_LL1_predict(); + + + // ll.print_LL1_predict(); + ll.get_token_strings(token_strings); + + + // ll.print_token_strings(); + ll.build_LL1_grammar(); + + + ll.fileout_LL1_grammar_log(outputs_grammar[i]); + + + // ll.print_LL1_grammar_log(); + cout << endl; + i++; + } + + + + return 0; +} \ No newline at end of file diff --git a/main/nfa.h b/main/nfa.h new file mode 100644 index 0000000..184b0cf --- /dev/null +++ b/main/nfa.h @@ -0,0 +1,173 @@ +#pragma once +#ifndef __NFA__H__ +#define __NFA__H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace std; +//单词符号的类型,返回<待测代码中的单词符号,WordType> +typedef enum WordType { + //当识别成标识符后,先判断是不是保留字,让后再判断IDN + KW_INT = 0, // int + KW_VOID, // void + KW_RETURN, // return + KW_CONST, // const + + OP_ADD, // + + 
OP_SUB, // - + OP_MUL, // * + OP_DIV, // / + OP_MOD, // % + OP_ASSIGN, // = + OP_GT, // > + OP_LT, // < + OP_EQ, // == + OP_LE, // <= + OP_GE, // >= + OP_NE, // != + OP_AND, // && + OP_OR, // || + + SE_LBRAC, // ( left backet + SE_RBRAC, // ) right bracket + SE_LCBRAC, // { left curly bracket + SE_RCBRAC, // } right curly bracket + SE_COMMA, // , + SE_SEMI, // ; + + IDN, // [a-zA-Z][a-zA-Z_0-9]* + INT_VAL, // -*[0-9]+ + UNKOWN +}WordType; +string getWordTypeName(WordType type); +//定义输入的字符类别 +typedef enum InputCharType { + LETTER = 0, // 字母 0 + UNDERLINE, // _ 1 + DIGIT, // 数字 2 当识别成功一个数字时,为了避免出现数字01的情况,返回前先进行一个判断,对GCC,01可以识别并等于1的 + //OP + ADD, // + 3 + SUB, // - 4 + MUL, // * 5 + DIV, // / 6 + MOD, // % 7 + EQ, // = 8 + GT, // > 9 + LT, // < 10 + NOT, // ! 11 + AND, // & 12 + OR, // | 13 + //SE + LBRACKET, // ( 14 + RBRACKET, // ) 15 + LCBRAC, // { 16 + RCBRAC, // } 17 + COMMA, // , 18 + SEMI, // ; 19 + + EPSILON, // 空字符 20 +}InputCharType; +string getInputChartypeName(InputCharType type); +enum class TokenType { + KW = 0, + OP, + SE, + IDN, + INT, + UNKNOWN +}; +TokenType getTokenType(WordType wordType,string buffer); +typedef struct Token { + string value; + TokenType type; +} Token; + +//定义函数判断输入的字符类别 +InputCharType getInputCharType(char c); +string getWordTypeName(WordType type,string buffer); + +//定义状态类 +class State { +public: + int id; // 状态编号 + map> transitions; // 转移函数映射表,记录每个输入字符类型对应的目标状态集合 + bool isFinalState; // 是否为最终状态 + WordType wordType; // 到达该状态时应该返回的词法单元类型 + State(int id) : id(id), isFinalState(false), wordType(UNKOWN) {} + void addTransition(InputCharType input, State* targetState) { + transitions[input].insert(targetState); + } + void setFinalState(bool isFinal, WordType type) { + isFinalState = isFinal; + wordType = type; + } + bool operator<(const State& other) const { + return id < other.id; + } +}; +//为了是set内部有序,定义排序结构体StatePtrCompare +struct StatePtrCompare { + bool operator()(const State* lhs, const State* rhs) const { + return lhs->id < 
rhs->id; + } +}; + +//定义NFA类 +class NFA { +public: + State* startState; // 起始状态 + set endStates; // 终止状态集合 + set states; // 状态集合 + NFA(State* startState, set endStates, set states) : + startState(startState), endStates(endStates), states(states) {} + // void printNFA(); +}; +NFA RexToNFA(); +void printNFA(const NFA& nfa); +NFA buildNFA(string filename); +NFA RexToNFA(); +set move(const set& states, InputCharType input); +set epsilonClosure(const set& states); + +class DFA { +public: + State* startState; // 起始状态 + set endStates; // 终止状态集合 + set states; // 状态集合 + DFA(State* startState, set endStates, set states) : + startState(startState), endStates(endStates), states(states) {} +}; +void removeUnreachableStates(DFA& dfa); +void printDFA(const DFA& dfa); +DFA nfaToDFA(const NFA& nfa); +void printDFA(const DFA& dfa); +struct SetComparator { + bool operator()(const set& a, const set& b) const { + if (a.size() != b.size()) { + return a.size() < b.size(); + } + + vector vecA(a.begin(), a.end()); + vector vecB(b.begin(), b.end()); + + sort(vecA.begin(), vecA.end(), [](const State* a, const State* b) { return a->id < b->id; }); + sort(vecB.begin(), vecB.end(), [](const State* a, const State* b) { return a->id < b->id; }); + + return vecA < vecB; + } +}; +string getGrammarName(WordType type, string buffer); +DFA minimizeDFA(const DFA& dfa); +vector recognize(const DFA& dfa, const string& input, const string& output); +string readfile(const string& filename); +#endif \ No newline at end of file diff --git a/nfa/CMakeLists.txt b/nfa/CMakeLists.txt new file mode 100644 index 0000000..cfc094e --- /dev/null +++ b/nfa/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.10) +project(nfa) + +# 收集所有的cpp源文件 +file(GLOB SOURCES "*.cpp") + +# 设置输出目录为 bin +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +# 创建静态链接库 +add_library(nfa STATIC ${SOURCES}) + +# 添加头文件目录 +target_include_directories(nfa PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/nfa/dfa.cpp 
b/nfa/dfa.cpp new file mode 100644 index 0000000..8b5009e --- /dev/null +++ b/nfa/dfa.cpp @@ -0,0 +1,251 @@ +#include "nfa.h" + +class Partition { +public: + set states; + Partition(set states) : states(states) {} +}; +/* +最小化算法步骤: +首先把所有节点分为N和A两个集合,集非结束态和结束态 +S = {N,A},然后遍历所有字符,去看每个字符都否对S中的状态集进行划分,每轮遍历下来,如果S仍然在扩大,则从头再来一轮。直到S不再扩大,即没有状态集可分为止。 +c can split s这里s指的是S中的一个状态集 +1.遍历s中每个状态,记录每个状态吃了字符c之后到达的状态,吃不了的不管。 +2.把到达的状态分类,分类依据:把属于同一个状态集的合在一起。这里的同一个状态集指的是S中现在有的状态集。 +3.按照第二步的分法把s分割。 +注意:是从s中分割出去,s最后保留下来的是吃了字符c还在状态集s中的状态或者吃不了c字符的状态。 +*/ + +// split 函数用于将给定的状态集合(group)根据转移函数进一步细分。 +// group: 要细分的状态集合 +// input: 当前考虑的输入字符类型 +// partitions: 存储所有分区的集合,如果需要细分,将在该集合中添加新分区 +void split(const set& group, InputCharType input, set& partitions) { + // 用于存储每个目标分区与对应新分组状态集合的映射 + map> targetPartitionsMap; + + for (State* state : group) { + auto it = state->transitions.find(input); + if (it != state->transitions.end()) { + State* targetState = *(it->second.begin());//DFA状态转移具有唯一性 + // 在当前所有分区中查找包含目标状态的分区 + for (Partition* partition : partitions) { + if (partition->states.find(targetState) != partition->states.end()) { + // 在映射表中将当前状态添加到对应的目标分区 + targetPartitionsMap[partition].insert(state); + break; + } + } + } + } + // 经过上述操作,将在group里的状态根据到达目标Partiset分到不同set + // 遍历目标分区映射表,检查是否需要进一步细分,即将经过input输入状态转换后处于不同目标分区的集合内部拆分开 + for (auto& entry : targetPartitionsMap) { + Partition* targetPartition = entry.first; + //到达该targetPartition的group部分状态合集如下: + set& newGroupStates = entry.second; + //等于的情况不拆分,不会出现大于的情况,将targetPartition拆分开来,也可以将到达不同割集的源状态分割开来,也可以分割目标状态,总之是状态转移结果在现存割集即可 + if (newGroupStates.size() < targetPartition->states.size()) { + for (State* state : newGroupStates) { + targetPartition->states.erase(state); + } + Partition* newGroup = new Partition(newGroupStates); + partitions.insert(newGroup); + } + } +} + +DFA minimizeDFA(const DFA& dfa) { + set partitions; + + // 将所有非终止状态分成一组,将所有终止状态按照 WordType 分组 + /* + * 不同 WordType 的终止状态表示的是不同的词法单元类型。 + * 这些状态在词法分析过程中具有不同的语义,不能被合并为同一个状态。 + */ + 
map> endStateGroups; //初始终态集合 + set nonEndStates; //初始非终态集合 + for (State* state : dfa.states) { + if (state->isFinalState) { + endStateGroups[state->wordType].insert(state);//使用wordType对终态集合进一步拆分 + } + else { + nonEndStates.insert(state); + } + } + //构造初始分割,是对{N,A}中A的扩展(即终态,加快算法速度,扩展原因见上) + for (auto& entry : endStateGroups) { + Partition* endStateGroup = new Partition(entry.second); + partitions.insert(endStateGroup); + } + Partition* nonEndStateGroup = new Partition(nonEndStates); + partitions.insert(nonEndStateGroup); + //对现有分隔进行再分隔,以获得最小化分割 + size_t oldSize;//分割集初始大小 + do { + oldSize = partitions.size(); + for (InputCharType input = static_cast(0); input < EPSILON; input = static_cast(input + 1)) {//类似于求Ia,Ib等 + for (Partition* partition : set(partitions)) {//遍历现存分割的每一个割集,看是否可再分割 + if (partition->states.size() > 1) {//为1的集合不可再分割 + split(partition->states, input, partitions);//核心分割函数 + } + } + } + } while (partitions.size() != oldSize);//当割集集合大小不再变化时停止 + + // 创建新的最小化 DFA,即重新映射dfa,重新编号状态 + // 构造DFA参数为DFA(State* set setset set states) + set minimizedStates; + set minimizedEndStates; + State* minimizedStartState = nullptr; + map stateMap; + + for (Partition* partition : partitions) {//遍历获得的每个割集 + State* newState = new State(minimizedStates.size());//编号 + // 检查当前划分是否包含旧DFA的开始状态,如果是,则将新状态设置为最小化DFA的开始状态 + if (partition->states.find(dfa.startState) != partition->states.end()) { + minimizedStartState = newState; + } + // 如果划分的状态集合不为空,选择一个代表状态 + if (!partition->states.empty()) { + State* representative = *(partition->states.begin());//因为在前面终止状态都分到了不同割集,且大小为1,所以如果是终止状态begin已经可以代表了 + //在分割状态集合的过程中,已经确保了一个划分中所有状态具有相同的属性,要么所有状态都是终止状态,要么都不是终止状态。所以我们只需要检查一个状态来确定新状态是否应该是终止状态。 + // 如果代表状态是终止状态,则设置新状态为终止状态,并保留相应的单词类型 + if (representative->isFinalState) { + newState->setFinalState(true, representative->wordType); + minimizedEndStates.insert(newState); + } + } + // 将集合里面所有旧状态映射到同一个新状态 + for (State* state : partition->states) + { + stateMap[state] = newState; + } + // 
将新状态插入到最小化DFA的状态集合中 + minimizedStates.insert(newState); + } + // 遍历旧DFA中的所有状态 + for (State* oldState : dfa.states) { + // 通过映射找到与旧状态对应的新状态 + State* newState = stateMap[oldState]; + for (const auto& transition : oldState->transitions) { + InputCharType input = transition.first; + State* oldTargetState = *(transition.second.begin());//dfa每个状态只有一个转移状态,沿用了nfa的结构,所以集合大小<=1 + State* newTargetState = stateMap[oldTargetState];// 获取旧状态的目标状态 + newState->addTransition(input, newTargetState);// 通过映射找到新的目标状态 + } + } + + // 清理并删除原始分区 + for (Partition* partition : partitions) { + delete partition; + } + return DFA(minimizedStartState, minimizedEndStates, minimizedStates); +} +void removeUnreachableStates(DFA& dfa) { + set reachableStates; //可达状态集合 + queue statesQueue; //状态队列 + + //将初始状态加入可达状态集合和队列 + reachableStates.insert(dfa.startState); + statesQueue.push(dfa.startState); + + // BFS 遍历 DFA,找出所有可达状态 + while (!statesQueue.empty()) { + State* currentState = statesQueue.front(); + statesQueue.pop(); + for (const auto& transition : currentState->transitions) { + State* targetState = *(transition.second.begin());//dfa每个状态只有一个转移状态,沿用了nfa的结构,所以集合大小<=1 + if (reachableStates.find(targetState) == reachableStates.end()) {//若未访问 + reachableStates.insert(targetState); + statesQueue.push(targetState); + } + } + } + + // 删除所有不可达状态 + for (auto it = dfa.states.begin(); it != dfa.states.end();) { + State* state = *it; + if (reachableStates.find(state) == reachableStates.end()) {//若当前状态不可达,删除 + it = dfa.states.erase(it); + delete state; + } + else { + ++it; + } + } +} +vector recognize(const DFA& dfa, const string& input, const string& output) { + + State* currentState = dfa.startState; + State* nextState = nullptr; + string buffer; + vector tokens; // 用于收集识别到的Token + //打开结果输出文件 + ofstream file(output); + if (!file.is_open()) { + + cout << "Error opening file!" 
<< endl; + return tokens; + } + for (size_t i = 0; i < input.length(); ++i) { + char c = input[i]; + if (c == ' '||c=='\n'||c=='\r\n'||c==' ')// 如果是空格、换行等分隔符,则跳过 + {continue; } + InputCharType inputCharType = getInputCharType(c); + auto it = currentState->transitions.find(inputCharType); + + if (it != currentState->transitions.end()) { + nextState = *(it->second.begin()); + buffer.push_back(c); + + if (nextState->isFinalState && i + 1 < input.length()) {// 如果下一个状态是终止状态并且还有剩余字符 + char nextChar = input[i + 1]; + InputCharType nextInputCharType = getInputCharType(nextChar); + auto nextIt = nextState->transitions.find(nextInputCharType);// 查找下一个状态的转换表中是否有对应的输入字符类型 + + if (nextIt == nextState->transitions.end()) {// 如果没有更多匹配的转换 + // 输出识别到的单词符号和对应的类型 + cout << buffer << "\t<" << getWordTypeName(nextState->wordType,buffer) << ">" << endl; + file << buffer << "\t<" << getWordTypeName(nextState->wordType, buffer) << ">" << endl; + tokens.push_back(getGrammarName(nextState->wordType, buffer)); + buffer.clear(); + currentState = dfa.startState; + } + else { + currentState = nextState;// 更新当前状态为下一个状态 + } + } + else { + currentState = nextState;// 更新当前状态为下一个状态 + } + } + else {// 如果没有找到匹配的转换 + if (currentState->isFinalState) {// 如果当前状态是终止状态 + // 输出识别到的单词符号和对应的类型 + cout << buffer << "\t<" << getWordTypeName(currentState->wordType,buffer) << ">" << endl; + file << buffer << "\t<" << getWordTypeName(currentState->wordType, buffer) << ">" << endl; + tokens.push_back(getGrammarName(currentState->wordType, buffer) ); + buffer.clear(); + } + else { + // 如果当前状态不是终止状态 + // 输出无法识别的字符信息 + cout << "Unrecognized characters: " << c << endl; + file << "Unrecognized characters: " << c << endl; + + buffer.clear(); + } + currentState = dfa.startState;// 回到起始状态 + //--i;// 重新处理当前字符,还是跳过吧,这里可以添加错误处理 + } + } + // 处理最后一个字符,如果缓冲区不为空且当前状态是终止状态,对应第一个if里面的else + if (!buffer.empty() && currentState->isFinalState) { + cout << buffer << "\t<" << getWordTypeName(currentState->wordType,buffer) << ">" << endl; 
+ file << buffer << "\t<" << getWordTypeName(currentState->wordType, buffer) << ">" << endl; + tokens.push_back(getGrammarName(currentState->wordType, buffer)); + } + file.close();//关闭文件 + return tokens; +} \ No newline at end of file diff --git a/nfa/nfa.cpp b/nfa/nfa.cpp new file mode 100644 index 0000000..cc3e3ca --- /dev/null +++ b/nfa/nfa.cpp @@ -0,0 +1,262 @@ +// 将正则表达式转换为非确定性有限自动机 + + + + +#include "nfa.h" + + + +// 处理正则表达式,描述终态 +NFA RexToNFA() { + //由于里面存在||,所以不同正则间使用空格分隔代表| l代表letter,_代表下划线,0代表数字(也可以是d,但是为了使用已经有的函数), + //[lu]代表l|u + string rex = "+ - * / % = > < == <= >= != && || ( ) { } , ; [l_][l_0]* -?00*"; + //下面给出正则对应的输出(终态) + vector finalState = { + OP_ADD, OP_SUB,OP_MUL,OP_DIV,OP_MOD,OP_ASSIGN,OP_GT,OP_LT, OP_EQ,OP_LE,OP_GE,OP_NE, OP_AND, OP_OR,SE_LBRAC, SE_RBRAC, + SE_LCBRAC,SE_RCBRAC,SE_COMMA,SE_SEMI,IDN,INT_VAL + }; + stringstream ss(rex); + string target; + + // 创建初始状态 + int stateIndex = 0; + int finalIndex = 0; + State* startState = new State(stateIndex++); + set endStates; + set allStates = { startState }; + while (getline(ss, target,' ')) { + //如获得[l_][l_0]* + State* currentState = startState; + + for (size_t i = 0; i < target.length();i++) { + //创建一个新状态,startState通过输入InputCharType到达该状态 + State* newState = new State(stateIndex++); + allStates.insert(newState); + //需要往后看一个符号 + if (target[i] == '[') { + //[...]构成一种输入,查看]后面是否有?或者*,来判断当前状态的构成 + for (i=i+1; i < target.length() && target[i] != ']'; i++) { + InputCharType input = getInputCharType(target[i]); + if (input != EPSILON) { + // 添加转移函数,从当前状态向新状态转移 + currentState->addTransition(input, newState); + } + } + } + else { + InputCharType input = getInputCharType(target[i]); + currentState->addTransition(input, newState); + } + //往后查看一个输入 + if (i + 1 < target.length() && target[i + 1] == '?') { + //创建EPSILON转移状态 + State* epsState = new State(stateIndex++); + allStates.insert(epsState); + currentState->addTransition(EPSILON, epsState); + newState->addTransition(EPSILON, epsState); + currentState = 
epsState; + // 跳过'?'字符 + i++; + } + else if (i + 1 < target.length() && target[i + 1] == '*') { + State* epsState = new State(stateIndex++); + allStates.insert(epsState); + currentState->addTransition(EPSILON, epsState); + newState->addTransition(EPSILON, epsState); + epsState->addTransition(EPSILON, currentState); + currentState = epsState; + // 跳过'*'字符 + i++; + } + else { + currentState = newState; + } + //判断是否是终止状态 + if (i == (target.length() - 1)) { + // 到达最后一个字符,将当前状态设置为终止状态 + currentState->setFinalState(true, finalState[endStates.size()]); + endStates.insert(currentState); + } + }//for + } + // 返回字符集合对应的NFA + return NFA(startState, endStates, allStates); +} + +// 构造状态机 +NFA buildNFA(string filename) { + ifstream ifs(filename); + if (!ifs) { + cerr << "Cannot open file: " << filename << endl; + exit(EXIT_FAILURE); + } + + int stateNum, inputNum; + ifs >> stateNum >> inputNum; + + vector states(stateNum); + for (int i = 0; i < stateNum; i++) { + states[i] = new State(i); + } + + State* startState = states[0]; + set endStates; + for (int i = 0; i < stateNum; i++) { + for (int j = 0; j < inputNum; j++) { + string targetStateIDs; + ifs >> targetStateIDs; + if (targetStateIDs.compare("#") != 0) { + stringstream ss(targetStateIDs); + string targetStateIDStr; + while (getline(ss, targetStateIDStr, ',')) { + int targetStateID = stoi(targetStateIDStr); + states[i]->addTransition(static_cast(j), states[targetStateID]); + } + } + } + } + + int endStateNum; + ifs >> endStateNum; + for (int i = 0; i < endStateNum; i++) { + int endStateID, wordTypeID; + ifs >> endStateID >> wordTypeID; + states[endStateID]->setFinalState(true, static_cast(wordTypeID)); + endStates.insert(states[endStateID]); + } + + return NFA(startState, endStates, set(states.begin(), states.end())); +} + +void printNFA(const NFA& nfa) { + cout << "Start state: " << nfa.startState->id << endl; + cout << "End states: "<id << " " << getWordTypeName(state->wordType) << " " << (state->isFinalState == true) << 
endl; + } + cout << endl; + + cout << "States and transitions:" << endl; + for (auto state : nfa.states) { + cout << "State " << state->id << ":" << endl; + for (auto transition : state->transitions) { + cout << "\tInput " << getInputChartypeName(transition.first) << ": "; + for (auto targetState : transition.second) { + cout << targetState->id << " "; + } + cout << endl; + } + } +} + +set move(const set& states, InputCharType input) { + set targetStates; + for (State* state : states) { + auto it = state->transitions.find(input); + if (it != state->transitions.end()) { + for (State* targetState : it->second) { + if (targetStates.find(targetState) == targetStates.end()) { + targetStates.insert(targetState); + } + } + } + } + return targetStates; +} + + +set epsilonClosure(const set& states) { + set closure = states; + stack stateStack; + for (State* state : states) { + stateStack.push(state); + } + while (!stateStack.empty()) { + State* currentState = stateStack.top(); + stateStack.pop(); + auto it = currentState->transitions.find(EPSILON); + if (it != currentState->transitions.end()) { + for (State* nextState : it->second) { + if (closure.find(nextState) == closure.end()) {//防止同一状态多次进栈,set自带去重 + closure.insert(nextState); + stateStack.push(nextState); + } + } + } + } + return closure; +} + +DFA nfaToDFA(const NFA& nfa) { + map, State*, SetComparator> dfaStatesMap; // 用于映射NFA状态集合到DFA状态的映射表 + queue> nfaStatesQueue; // 用于BFS遍历的集合队列 + set dfaStates; + set dfaEndStates; + + set nfaStartClosure = epsilonClosure({ nfa.startState }); + State* dfaStartState = new State(0); + dfaStatesMap[nfaStartClosure] = dfaStartState; + dfaStates.insert(dfaStartState); + nfaStatesQueue.push(nfaStartClosure); + + int nextStateId = 1; + //set nfaStartClosure + while (!nfaStatesQueue.empty()) { + set currentNFAStates = nfaStatesQueue.front(); + nfaStatesQueue.pop(); + State* currentDFAState = dfaStatesMap[currentNFAStates]; + + // 检查是否有终止状态,如果有,设置DFA状态为终止状态 + for (State* nfaState : 
currentNFAStates) { + if (nfaState->isFinalState) { + // cout << nfaState->id << "is FinalState" << endl; + currentDFAState->setFinalState(true, nfaState->wordType); + dfaEndStates.insert(currentDFAState); + break; + } + } + + // 遍历所有输入字符类型 + for (int i = 0; i < static_cast(EPSILON); i++) { + InputCharType inputCharType = static_cast(i); + set nextNFAStates = epsilonClosure(move(currentNFAStates, inputCharType)); + if (nextNFAStates.empty()) { + continue; + } + + // 如果NFA状态集合不存在于映射表中,则创建新的DFA状态 + if (dfaStatesMap.find(nextNFAStates) == dfaStatesMap.end()) { + State* newDFAState = new State(nextStateId++); + dfaStatesMap[nextNFAStates] = newDFAState; + dfaStates.insert(newDFAState); + nfaStatesQueue.push(nextNFAStates); + } + currentDFAState->addTransition(inputCharType, dfaStatesMap[nextNFAStates]); + } + } + + return DFA(dfaStartState, dfaEndStates, dfaStates); +} + +void printDFA(const DFA& dfa) { + cout << "Start state: " << dfa.startState->id << endl; + cout << "End states: "<id << " " << getWordTypeName(state->wordType) << endl; + } + cout << endl; + cout << "States and transitions:" << endl; + for (auto state : dfa.states) { + cout << "State " << state->id << ":" << endl; + for (auto transition : state->transitions) { + cout << "\tInput " << getInputChartypeName(transition.first) << ": "; + for (auto targetState : transition.second) { + cout << targetState->id << " "; + } + cout << endl; + } + } +} + diff --git a/nfa/nfa.h b/nfa/nfa.h new file mode 100644 index 0000000..184b0cf --- /dev/null +++ b/nfa/nfa.h @@ -0,0 +1,173 @@ +#pragma once +#ifndef __NFA__H__ +#define __NFA__H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace std; +//单词符号的类型,返回<待测代码中的单词符号,WordType> +typedef enum WordType { + //当识别成标识符后,先判断是不是保留字,让后再判断IDN + KW_INT = 0, // int + KW_VOID, // void + KW_RETURN, // return + KW_CONST, // const + + OP_ADD, // + + OP_SUB, // - + OP_MUL, // * + OP_DIV, // / + 
// ===== nfa/nfa.h =====
// Shared declarations for the lexer: token/word types, input character
// classes, the State/NFA/DFA data structures, and the helper functions
// implemented in nfa.cpp, dfa.cpp and tool.cpp.
#pragma once
#ifndef __NFA__H__
#define __NFA__H__

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <queue>
#include <stack>
#include <algorithm>
#include <iterator>
#include <cstdlib>
using namespace std;

// Lexical token categories: the recognizer returns <lexeme, WordType>.
typedef enum WordType {
    // Keywords are first recognized as IDN, then reclassified by lexeme.
    KW_INT = 0,   // int
    KW_VOID,      // void
    KW_RETURN,    // return
    KW_CONST,     // const

    OP_ADD,       // +
    OP_SUB,       // -
    OP_MUL,       // *
    OP_DIV,       // /
    OP_MOD,       // %
    OP_ASSIGN,    // =
    OP_GT,        // >
    OP_LT,        // <
    OP_EQ,        // ==
    OP_LE,        // <=
    OP_GE,        // >=
    OP_NE,        // !=
    OP_AND,       // &&
    OP_OR,        // ||

    SE_LBRAC,     // ( left bracket
    SE_RBRAC,     // ) right bracket
    SE_LCBRAC,    // { left curly bracket
    SE_RCBRAC,    // } right curly bracket
    SE_COMMA,     // ,
    SE_SEMI,      // ;

    IDN,          // [a-zA-Z][a-zA-Z_0-9]*
    INT_VAL,      // integer literal
    UNKOWN        // (sic — spelling kept: existing code depends on this name)
} WordType;
string getWordTypeName(WordType type);

// Character classes consumed by the automata.
typedef enum InputCharType {
    LETTER = 0,   // a-z A-Z
    UNDERLINE,    // _
    DIGIT,        // 0-9
    // operators
    ADD,          // +
    SUB,          // -
    MUL,          // *
    DIV,          // /
    MOD,          // %
    EQ,           // =
    GT,           // >
    LT,           // <
    NOT,          // !
    AND,          // &
    OR,           // |
    // separators
    LBRACKET,     // (
    RBRACKET,     // )
    LCBRAC,       // {
    RCBRAC,       // }
    COMMA,        // ,
    SEMI,         // ;

    EPSILON       // empty input; also the sentinel past the last real class
} InputCharType;
string getInputChartypeName(InputCharType type);

// Coarse token categories used by the parser-facing API.
enum class TokenType {
    KW = 0,
    OP,
    SE,
    IDN,
    INT,
    UNKNOWN
};
TokenType getTokenType(WordType wordType, string buffer);

typedef struct Token {
    string value;
    TokenType type;
} Token;

// Classify a raw character; unknown characters map to EPSILON.
InputCharType getInputCharType(char c);
string getWordTypeName(WordType type, string buffer);

// An automaton state shared by both NFA and DFA (a DFA simply stores at
// most one target per input in `transitions`).
class State {
public:
    int id;                                      // unique state number
    map<InputCharType, set<State*>> transitions; // input class -> target states
    bool isFinalState;                           // accepting state?
    WordType wordType;                           // token emitted on acceptance
    State(int id) : id(id), isFinalState(false), wordType(UNKOWN) {}
    void addTransition(InputCharType input, State* targetState) {
        transitions[input].insert(targetState);
    }
    void setFinalState(bool isFinal, WordType type) {
        isFinalState = isFinal;
        wordType = type;
    }
    bool operator<(const State& other) const {
        return id < other.id;
    }
};

// Orders State pointers by id so containers can be made deterministic.
struct StatePtrCompare {
    bool operator()(const State* lhs, const State* rhs) const {
        return lhs->id < rhs->id;
    }
};

// A non-deterministic finite automaton.
class NFA {
public:
    State* startState;      // start state
    set<State*> endStates;  // accepting states
    set<State*> states;     // all states
    NFA(State* startState, set<State*> endStates, set<State*> states) :
        startState(startState), endStates(endStates), states(states) {}
};
NFA RexToNFA();
void printNFA(const NFA& nfa);
NFA buildNFA(string filename);
set<State*> move(const set<State*>& states, InputCharType input);
set<State*> epsilonClosure(const set<State*>& states);

// A deterministic finite automaton (each transition set holds one target).
class DFA {
public:
    State* startState;      // start state
    set<State*> endStates;  // accepting states
    set<State*> states;     // all states
    DFA(State* startState, set<State*> endStates, set<State*> states) :
        startState(startState), endStates(endStates), states(states) {}
};
void removeUnreachableStates(DFA& dfa);
void printDFA(const DFA& dfa);
DFA nfaToDFA(const NFA& nfa);

// Strict weak ordering over NFA state subsets so they can key a std::map
// (used by nfaToDFA).  Fix: the original sorted by id but then compared the
// pointer vectors with operator<, whose relational order on unrelated
// pointers is unspecified; comparing the sorted ids gives a well-defined,
// deterministic order with identical equality semantics (ids are unique
// within a machine).
struct SetComparator {
    bool operator()(const set<State*>& a, const set<State*>& b) const {
        if (a.size() != b.size()) {
            return a.size() < b.size();
        }
        vector<int> idsA;
        vector<int> idsB;
        idsA.reserve(a.size());
        idsB.reserve(b.size());
        for (const State* s : a) idsA.push_back(s->id);
        for (const State* s : b) idsB.push_back(s->id);
        sort(idsA.begin(), idsA.end());
        sort(idsB.begin(), idsB.end());
        return idsA < idsB;
    }
};

string getGrammarName(WordType type, string buffer);
DFA minimizeDFA(const DFA& dfa);
vector<string> recognize(const DFA& dfa, const string& input, const string& output);
string readfile(const string& filename);
#endif
LBRACKET; + case ')': return RBRACKET; + case '{': return LCBRAC; + case '}': return RCBRAC; + case ',': return COMMA; + case ';': return SEMI; + default: + if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { + return LETTER; + } + else if (c >= '0' && c <= '9') { + return DIGIT; + } + else { + return EPSILON; + } + } +} +string getInputChartypeName(InputCharType type) { + switch (type) + { + case LETTER: + return "LETTER"; + case UNDERLINE: + return "UNDERLINE"; + case DIGIT: + return "DIGIT"; + case ADD: + return "+"; + case SUB: + return "-"; + case MUL: + return "*"; + case DIV: + return "/"; + case MOD: + return "%"; + case EQ: + return "="; + case GT: + return ">"; + case LT: + return "<"; + case NOT: + return "!"; + case AND: + return "&"; + case OR: + return "|"; + case LBRACKET: + return "("; + case RBRACKET: + return ")"; + case LCBRAC: + return "{"; + case RCBRAC: + return "}"; + case COMMA: + return ","; + case SEMI: + return ";"; + case EPSILON: + return "EPSILON"; + default: + return "UNKOWN"; + } +} +string getWordTypeName(WordType type, string buffer) { + switch (type) { + case OP_ADD: + case OP_SUB: + case OP_MUL: + case OP_DIV: + case OP_MOD: + case OP_ASSIGN: + case OP_GT: + case OP_LT: + case OP_EQ: + case OP_LE: + case OP_GE: + case OP_NE: + case OP_AND: + case OP_OR: + return "OP"; + + case SE_LBRAC: + case SE_RBRAC: + case SE_LCBRAC: + case SE_RCBRAC: + case SE_COMMA: + case SE_SEMI: + return "SE"; + + case IDN: + if (!buffer.compare("int") || !buffer.compare("void") || !buffer.compare("const") || !buffer.compare("return")){ + return "KW"; + } + else { + return "IDN"; + } + + case INT_VAL: + return "INT"; + + default: + return "UNKNOWN"; + } +} + +string readfile(const string& filename) +{ + // 打开文件流并读取文件内容 + ifstream file(filename); + + string content((istreambuf_iterator(file)), + istreambuf_iterator()); + + // 去掉换行符 + //remove函数的作用是将字符串中的某个字符移动到字符串的末尾,并返回一个指向该字符后面位置的指针。 + //erase 函数的作用是删除字符串中指定区间内的所有字符,返回修改后的字符串 + 
//content.erase(remove(content.begin(), content.end(), '\n'), content.end()); + + return content; +} +TokenType getTokenType(WordType type,string buffer) { + switch (type) { + case OP_ADD: + case OP_SUB: + case OP_MUL: + case OP_DIV: + case OP_MOD: + case OP_ASSIGN: + case OP_GT: + case OP_LT: + case OP_EQ: + case OP_LE: + case OP_GE: + case OP_NE: + case OP_AND: + case OP_OR: + return TokenType::OP; + + case SE_LBRAC: + case SE_RBRAC: + case SE_LCBRAC: + case SE_RCBRAC: + case SE_COMMA: + case SE_SEMI: + return TokenType::SE; + + case IDN: + if (!buffer.compare("int") || !buffer.compare("void") || !buffer.compare("const") || !buffer.compare("return")) { + return TokenType::KW; + } + else { + return TokenType::IDN; + } + + case INT_VAL: + return TokenType::INT; + + default: + return TokenType::UNKNOWN; + } +} + +string getWordTypeName(WordType type) { + switch (type) { + case KW_INT: + return "KW_INT"; + case KW_VOID: + return "KW_VOID"; + case KW_RETURN: + return "KW_RETURN"; + case KW_CONST: + return "KW_CONST"; + case OP_ADD: + return "OP_ADD"; + case OP_SUB: + return "OP_SUB"; + case OP_MUL: + return "OP_MUL"; + case OP_DIV: + return "OP_DIV"; + case OP_MOD: + return "OP_MOD"; + case OP_ASSIGN: + return "OP_ASSIGN"; + case OP_GT: + return "OP_GT"; + case OP_LT: + return "OP_LT"; + case OP_EQ: + return "OP_EQ"; + case OP_LE: + return "OP_LE"; + case OP_GE: + return "OP_GE"; + case OP_NE: + return "OP_NE"; + case OP_AND: + return "OP_AND"; + case OP_OR: + return "OP_OR"; + case SE_LBRAC: + return "SE_LBRAC"; + case SE_RBRAC: + return "SE_RBRAC"; + case SE_LCBRAC: + return "SE_LCBRAC"; + case SE_RCBRAC: + return "SE_RCBRAC"; + case SE_COMMA: + return "SE_COMMA"; + case SE_SEMI: + return "SE_SEMI"; + case IDN: + return "IDN"; + case INT_VAL: + return "INT_VAL"; + default: + return "UNKNOWN"; + } +} + +string getGrammarName(WordType type, string buffer) { + switch (type) { + + case OP_ADD: return "+"; + case OP_SUB: return "-"; + case OP_MUL: return "*"; + case 
OP_DIV: return "/"; + case OP_MOD: return "%"; + case OP_ASSIGN: return "="; + case OP_GT: return ">"; + case OP_LT: return "<"; + case OP_EQ: return "=="; + case OP_LE: return "<="; + case OP_GE: return ">="; + case OP_NE: return "!="; + case OP_AND: return "&&"; + case OP_OR: return "||"; + + case SE_LBRAC: return "("; + case SE_RBRAC: return ")"; + case SE_LCBRAC: return "{"; + case SE_RCBRAC: return "}"; + case SE_COMMA: return ","; + case SE_SEMI: return ";"; + + case IDN: + if (!buffer.compare("int")) { + return "int"; + } + else if (!buffer.compare("void")) { + return "void"; + } + else if (!buffer.compare("return")) { + return "return"; + } + else if (!buffer.compare("const")) { + return "const"; + } + else { + return "IDN"; + } + case INT_VAL: return "INT"; + default: cerr << "Token Error: "<< type << endl; exit(-1); + } +} \ No newline at end of file