流式清洗文本Tag
代码实现如下:逐字符扫描流式输入,遇到 "<" 时先把后续字符缓存起来;如果缓存最终拼成一个已知标签就丢弃,否则把缓存内容原样输出。
class TagFilter:
    """Remove a fixed set of markup tags from text that arrives in chunks.

    The filter is stateful: a tag may be split across several ``process``
    calls (e.g. ``"<thi"`` followed by ``"nk>"``), so characters that could
    still turn out to be a tag are held back in ``buffer`` until the tag
    either completes (and is dropped) or is ruled out (and is emitted as
    ordinary text). Call ``flush`` once the stream has ended to recover any
    text that is still being held back.
    """

    def __init__(self):
        # Tags that are stripped from the stream.
        self.tags = ["<think>", "</think>", "<source>", "</source>", "<tag>", "</tag>"]
        self.buffer = ""  # holds a possibly incomplete tag
        self.in_tag = False  # True while ``buffer`` is a prefix of some tag

    def process(self, chunk: str) -> str:
        """Consume one chunk and return the text that is safe to emit.

        Args:
            chunk: The next piece of the input stream; may be empty and may
                end in the middle of a tag.

        Returns:
            The chunk's text with complete tags removed. Characters that may
            still belong to an unfinished tag are kept in ``buffer`` and are
            not part of the return value.
        """
        output = []
        for ch in chunk:
            if not self.in_tag:
                if ch == "<":
                    # Possible start of a tag: hold it back instead of emitting.
                    self.in_tag = True
                    self.buffer = "<"
                else:
                    output.append(ch)
                continue

            candidate = self.buffer + ch
            if any(t.startswith(candidate) for t in self.tags):
                if candidate in self.tags:
                    # Complete tag: drop it and return to plain-text mode.
                    self.buffer = ""
                    self.in_tag = False
                else:
                    # Still a proper prefix of some tag: keep collecting.
                    self.buffer = candidate
                continue

            # ``candidate`` cannot become a tag, so the text buffered so far
            # was ordinary text and must be emitted.
            output.append(self.buffer)
            if ch == "<":
                # The current character may itself open a tag (as in
                # "<<think>"), so restart matching from it rather than
                # emitting it together with the stale buffer.
                self.buffer = "<"
            else:
                output.append(ch)
                self.buffer = ""
                self.in_tag = False
        return "".join(output)

    def flush(self) -> str:
        """Return any held-back text and reset the filter.

        Intended to be called when the stream has ended: an unfinished tag
        prefix such as ``"<ta"`` can no longer complete, so it is handed back
        as ordinary text.

        Returns:
            The buffered text, or an empty string if nothing was pending.
        """
        pending = self.buffer
        self.buffer = ""
        self.in_tag = False
        return pending
# Demo: push three chunks through one shared filter instance and print what
# each call lets through.
f = TagFilter()
for chunk in (
    "aaa<think>",      # → aaa
    "aaa<source>bbb",  # → aaabbb
    "aaa<ta",          # → aaa (the unfinished "<ta" stays buffered)
):
    print(f.process(chunk))
转载请注明来源 goldandrabbit.github.io