253 lines
13 KiB
C++
253 lines
13 KiB
C++
#include "nfa.h"
|
||
|
||
class Partition {
|
||
public:
|
||
set<State*, StatePtrCompare> states;
|
||
Partition(set<State*, StatePtrCompare> states) : states(states) {}
|
||
};
|
||
/*
|
||
最小化算法步骤:
|
||
首先把所有节点分为N和A两个集合,集非结束态和结束态
|
||
S = {N,A},然后遍历所有字符,去看每个字符都否对S中的状态集进行划分,每轮遍历下来,如果S仍然在扩大,则从头再来一轮。直到S不再扩大,即没有状态集可分为止。
|
||
c can split s这里s指的是S中的一个状态集
|
||
1.遍历s中每个状态,记录每个状态吃了字符c之后到达的状态,吃不了的不管。
|
||
2.把到达的状态分类,分类依据:把属于同一个状态集的合在一起。这里的同一个状态集指的是S中现在有的状态集。
|
||
3.按照第二步的分法把s分割。
|
||
注意:是从s中分割出去,s最后保留下来的是吃了字符c还在状态集s中的状态或者吃不了c字符的状态。
|
||
*/
|
||
|
||
// split 函数用于将给定的状态集合(group)根据转移函数进一步细分。
|
||
// group: 要细分的状态集合
|
||
// input: 当前考虑的输入字符类型
|
||
// partitions: 存储所有分区的集合,如果需要细分,将在该集合中添加新分区
|
||
void split(const set<State*, StatePtrCompare>& group, InputCharType input, set<Partition*>& partitions) {
|
||
// 用于存储每个目标分区与对应新分组状态集合的映射
|
||
map<Partition*, set<State*, StatePtrCompare>> targetPartitionsMap;
|
||
|
||
for (State* state : group) {
|
||
auto it = state->transitions.find(input);
|
||
if (it != state->transitions.end()) {
|
||
State* targetState = *(it->second.begin());//DFA状态转移具有唯一性
|
||
// 在当前所有分区中查找包含目标状态的分区
|
||
for (Partition* partition : partitions) {
|
||
if (partition->states.find(targetState) != partition->states.end()) {
|
||
// 在映射表中将当前状态添加到对应的目标分区
|
||
targetPartitionsMap[partition].insert(state);
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
// 经过上述操作,将在group里的状态根据到达目标Partiset<State*, StatePtrCompare>分到不同set<State*>
|
||
// 遍历目标分区映射表,检查是否需要进一步细分,即将经过input输入状态转换后处于不同目标分区的集合内部拆分开
|
||
for (auto& entry : targetPartitionsMap) {
|
||
Partition* targetPartition = entry.first;
|
||
//到达该targetPartition的group部分状态合集如下:
|
||
set<State*, StatePtrCompare>& newGroupStates = entry.second;
|
||
//等于的情况不拆分,不会出现大于的情况,将targetPartition拆分开来,也可以将到达不同割集的源状态分割开来,也可以分割目标状态,总之是状态转移结果在现存割集即可
|
||
if (newGroupStates.size() < targetPartition->states.size()) {
|
||
for (State* state : newGroupStates) {
|
||
targetPartition->states.erase(state);
|
||
}
|
||
Partition* newGroup = new Partition(newGroupStates);
|
||
partitions.insert(newGroup);
|
||
}
|
||
}
|
||
}
|
||
|
||
DFA minimizeDFA(const DFA& dfa) {
|
||
set<Partition*> partitions;
|
||
|
||
// 将所有非终止状态分成一组,将所有终止状态按照 WordType 分组
|
||
/*
|
||
* 不同 WordType 的终止状态表示的是不同的词法单元类型。
|
||
* 这些状态在词法分析过程中具有不同的语义,不能被合并为同一个状态。
|
||
*/
|
||
map<WordType, set<State*, StatePtrCompare>> endStateGroups; //初始终态集合
|
||
set<State*, StatePtrCompare> nonEndStates; //初始非终态集合
|
||
for (State* state : dfa.states) {
|
||
if (state->isFinalState) {
|
||
endStateGroups[state->wordType].insert(state);//使用wordType对终态集合进一步拆分
|
||
}
|
||
else {
|
||
nonEndStates.insert(state);
|
||
}
|
||
}
|
||
//构造初始分割,是对{N,A}中A的扩展(即终态,加快算法速度,扩展原因见上)
|
||
for (auto& entry : endStateGroups) {
|
||
Partition* endStateGroup = new Partition(entry.second);
|
||
partitions.insert(endStateGroup);
|
||
}
|
||
Partition* nonEndStateGroup = new Partition(nonEndStates);
|
||
partitions.insert(nonEndStateGroup);
|
||
//对现有分隔进行再分隔,以获得最小化分割
|
||
size_t oldSize;//分割集初始大小
|
||
do {
|
||
oldSize = partitions.size();
|
||
for (InputCharType input = static_cast<InputCharType>(0); input < EPSILON; input = static_cast<InputCharType>(input + 1)) {//类似于求Ia,Ib等
|
||
for (Partition* partition : set<Partition*>(partitions)) {//遍历现存分割的每一个割集,看是否可再分割
|
||
if (partition->states.size() > 1) {//为1的集合不可再分割
|
||
split(partition->states, input, partitions);//核心分割函数
|
||
}
|
||
}
|
||
}
|
||
} while (partitions.size() != oldSize);//当割集集合大小不再变化时停止
|
||
|
||
// 创建新的最小化 DFA,即重新映射dfa,重新编号状态
|
||
// 构造DFA参数为DFA(State* set<State*, StatePtrCompare> set<State*>set<State*, StatePtrCompare> set<State*> states)
|
||
set<State*, StatePtrCompare> minimizedStates;
|
||
set<State*, StatePtrCompare> minimizedEndStates;
|
||
State* minimizedStartState = nullptr;
|
||
map<State*, State*> stateMap;
|
||
|
||
for (Partition* partition : partitions) {//遍历获得的每个割集
|
||
State* newState = new State(minimizedStates.size());//编号
|
||
// 检查当前划分是否包含旧DFA的开始状态,如果是,则将新状态设置为最小化DFA的开始状态
|
||
if (partition->states.find(dfa.startState) != partition->states.end()) {
|
||
minimizedStartState = newState;
|
||
}
|
||
// 如果划分的状态集合不为空,选择一个代表状态
|
||
if (!partition->states.empty()) {
|
||
State* representative = *(partition->states.begin());//因为在前面终止状态都分到了不同割集,且大小为1,所以如果是终止状态begin已经可以代表了
|
||
//在分割状态集合的过程中,已经确保了一个划分中所有状态具有相同的属性,要么所有状态都是终止状态,要么都不是终止状态。所以我们只需要检查一个状态来确定新状态是否应该是终止状态。
|
||
// 如果代表状态是终止状态,则设置新状态为终止状态,并保留相应的单词类型
|
||
if (representative->isFinalState) {
|
||
newState->setFinalState(true, representative->wordType);
|
||
minimizedEndStates.insert(newState);
|
||
}
|
||
}
|
||
// 将集合里面所有旧状态映射到同一个新状态
|
||
for (State* state : partition->states)
|
||
{
|
||
stateMap[state] = newState;
|
||
}
|
||
// 将新状态插入到最小化DFA的状态集合中
|
||
minimizedStates.insert(newState);
|
||
}
|
||
// 遍历旧DFA中的所有状态
|
||
for (State* oldState : dfa.states) {
|
||
// 通过映射找到与旧状态对应的新状态
|
||
State* newState = stateMap[oldState];
|
||
for (const auto& transition : oldState->transitions) {
|
||
InputCharType input = transition.first;
|
||
State* oldTargetState = *(transition.second.begin());//dfa每个状态只有一个转移状态,沿用了nfa的结构,所以集合大小<=1
|
||
State* newTargetState = stateMap[oldTargetState];// 获取旧状态的目标状态
|
||
newState->addTransition(input, newTargetState);// 通过映射找到新的目标状态
|
||
}
|
||
}
|
||
|
||
// 清理并删除原始分区
|
||
for (Partition* partition : partitions) {
|
||
delete partition;
|
||
}
|
||
return DFA(minimizedStartState, minimizedEndStates, minimizedStates);
|
||
}
|
||
|
||
void removeUnreachableStates(DFA& dfa) {
|
||
set<State*> reachableStates; //可达状态集合
|
||
queue<State*> statesQueue; //状态队列
|
||
|
||
//将初始状态加入可达状态集合和队列
|
||
reachableStates.insert(dfa.startState);
|
||
statesQueue.push(dfa.startState);
|
||
|
||
// BFS 遍历 DFA,找出所有可达状态
|
||
while (!statesQueue.empty()) {
|
||
State* currentState = statesQueue.front();
|
||
statesQueue.pop();
|
||
for (const auto& transition : currentState->transitions) {
|
||
State* targetState = *(transition.second.begin());//dfa每个状态只有一个转移状态,沿用了nfa的结构,所以集合大小<=1
|
||
if (reachableStates.find(targetState) == reachableStates.end()) {//若未访问
|
||
reachableStates.insert(targetState);
|
||
statesQueue.push(targetState);
|
||
}
|
||
}
|
||
}
|
||
|
||
// 删除所有不可达状态
|
||
for (auto it = dfa.states.begin(); it != dfa.states.end();) {
|
||
State* state = *it;
|
||
if (reachableStates.find(state) == reachableStates.end()) {//若当前状态不可达,删除
|
||
it = dfa.states.erase(it);
|
||
delete state;
|
||
}
|
||
else {
|
||
++it;
|
||
}
|
||
}
|
||
}
|
||
|
||
vector<string> recognize(const DFA& dfa, const string& input, const string& output) {
|
||
|
||
State* currentState = dfa.startState;
|
||
State* nextState = nullptr;
|
||
string buffer;
|
||
vector<string> tokens; // 用于收集识别到的Token
|
||
//打开结果输出文件
|
||
ofstream file(output);
|
||
if (!file.is_open()) {
|
||
|
||
cout << "Error opening file!" << endl;
|
||
return tokens;
|
||
}
|
||
for (size_t i = 0; i < input.length(); ++i) {
|
||
char c = input[i];
|
||
if (c == ' '||c=='\n'||c=='\r\n'||c==' ')// 如果是空格、换行等分隔符,则跳过
|
||
{continue; }
|
||
InputCharType inputCharType = getInputCharType(c);
|
||
auto it = currentState->transitions.find(inputCharType);
|
||
|
||
if (it != currentState->transitions.end()) {
|
||
nextState = *(it->second.begin());
|
||
buffer.push_back(c);
|
||
|
||
if (nextState->isFinalState && i + 1 < input.length()) {// 如果下一个状态是终止状态并且还有剩余字符
|
||
char nextChar = input[i + 1];
|
||
InputCharType nextInputCharType = getInputCharType(nextChar);
|
||
auto nextIt = nextState->transitions.find(nextInputCharType);// 查找下一个状态的转换表中是否有对应的输入字符类型
|
||
|
||
if (nextIt == nextState->transitions.end()) {// 如果没有更多匹配的转换
|
||
// 输出识别到的单词符号和对应的类型
|
||
cout << buffer << "\t<" << getWordTypeName(nextState->wordType,buffer)<<","<<getWordAttribute(nextState->wordType,buffer) << ">" << endl;
|
||
file << buffer << "\t<" << getWordTypeName(nextState->wordType, buffer)<<","<<getWordAttribute(nextState->wordType,buffer) << ">" << endl;
|
||
tokens.push_back(getGrammarName(nextState->wordType, buffer));
|
||
buffer.clear();
|
||
currentState = dfa.startState;
|
||
}
|
||
else {
|
||
currentState = nextState;// 更新当前状态为下一个状态
|
||
}
|
||
}
|
||
else {
|
||
currentState = nextState;// 更新当前状态为下一个状态
|
||
}
|
||
}
|
||
else {// 如果没有找到匹配的转换
|
||
if (currentState->isFinalState) {// 如果当前状态是终止状态
|
||
// 输出识别到的单词符号和对应的类型
|
||
cout << buffer << "\t<" << getWordTypeName(currentState->wordType,buffer) <<","<<getWordAttribute(nextState->wordType,buffer) << ">" << endl;
|
||
file << buffer << "\t<" << getWordTypeName(currentState->wordType, buffer) << ","<<getWordAttribute(nextState->wordType,buffer) << ">" << endl;
|
||
tokens.push_back(getGrammarName(currentState->wordType, buffer) );
|
||
buffer.clear();
|
||
}
|
||
else {
|
||
// 如果当前状态不是终止状态
|
||
// 输出无法识别的字符信息
|
||
cout << "Unrecognized characters: " << c << endl;
|
||
file << "Unrecognized characters: " << c << endl;
|
||
|
||
buffer.clear();
|
||
}
|
||
currentState = dfa.startState;// 回到起始状态
|
||
//--i;// 重新处理当前字符,还是跳过吧,这里可以添加错误处理
|
||
}
|
||
}
|
||
// 处理最后一个字符,如果缓冲区不为空且当前状态是终止状态,对应第一个if里面的else
|
||
if (!buffer.empty() && currentState->isFinalState) {
|
||
cout << buffer << "\t<" << getWordTypeName(currentState->wordType,buffer) << ","<<getWordAttribute(nextState->wordType,buffer) << ">" << endl;
|
||
file << buffer << "\t<" << getWordTypeName(currentState->wordType, buffer) << ","<<getWordAttribute(nextState->wordType,buffer) << ">" << endl;
|
||
tokens.push_back(getGrammarName(currentState->wordType, buffer));
|
||
}
|
||
file.close();//关闭文件
|
||
return tokens;
|
||
} |