流式清洗文本 Tag

  1. 流式清洗文本Tag

流式清洗文本Tag

代码实现如下:逐字符扫描流式到达的文本块,遇到 "<" 时先把后续字符缓存起来,直到能够确定它是否属于已知标签,再决定丢弃(是标签)还是原样输出(不是标签)。

class TagFilter:
    """Remove a fixed set of markup tags from text that arrives in chunks.

    Only the tag tokens listed in ``self.tags`` are removed; the text between
    an opening and a closing tag is passed through unchanged. A tag may be
    split across consecutive ``process`` calls: the possibly incomplete tag is
    held in ``self.buffer`` until it either completes (and is dropped) or
    stops matching any known tag (and is emitted as ordinary text).
    """

    def __init__(self) -> None:
        self.tags = ["<think>", "</think>", "<source>", "</source>", "<tag>", "</tag>"]
        self.buffer = ""  # text of a possibly incomplete tag carried between calls
        self.in_tag = False  # True while self.buffer holds a candidate tag prefix

    def process(self, chunk: str) -> str:
        """Filter one chunk of the stream and return the text that is safe to emit.

        Args:
            chunk: The next piece of streamed text; may be empty and may end
                in the middle of a tag.

        Returns:
            The chunk with every completed known tag removed. Text that might
            still turn out to be a tag is withheld and resolved by a later
            call (or returned by ``flush``).
        """
        output = []
        for ch in chunk:
            if self.in_tag:
                candidate = self.buffer + ch
                if candidate in self.tags:
                    # A complete tag was collected: drop it entirely.
                    self.buffer = ""
                    self.in_tag = False
                    continue
                if any(t.startswith(candidate) for t in self.tags):
                    # Still a viable tag prefix: keep collecting, emit nothing.
                    self.buffer = candidate
                    continue
                # The buffered text was not a tag after all, so it is literal
                # output. Only the previously buffered part is emitted here;
                # the current character falls through to the normal path so
                # that a "<" can start a fresh tag candidate instead of being
                # swallowed into the literal text.
                output.append(self.buffer)
                self.buffer = ""
                self.in_tag = False
            if ch == "<":
                # Possible start of a tag: begin buffering.
                self.in_tag = True
                self.buffer = "<"
                continue
            output.append(ch)
        return "".join(output)

    def flush(self) -> str:
        """Return any withheld partial-tag text and reset the filter state.

        Call this once the stream has ended: an unfinished candidate such as
        ``"<ta"`` can no longer become a tag, so it is returned as plain text.

        Returns:
            The pending buffered text, or an empty string if nothing is pending.
        """
        pending = self.buffer
        self.buffer = ""
        self.in_tag = False
        return pending

# Feed three consecutive chunks through a single filter instance and print
# what each call emits; the expected output is noted next to each chunk.
f = TagFilter()
for piece in (
    "aaa<think>",      # → aaa
    "aaa<source>bbb",  # → aaabbb
    "aaa<ta",          # → aaa  (the partial "<ta" is withheld in the buffer)
):
    print(f.process(piece))

转载请注明来源 goldandrabbit.github.io