Task: recognize the Japanese text in an image and translate it into Chinese.

LangGraph implementation:

from langgraph.graph import StateGraph, END
from typing import TypedDict, Optional
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
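
# Environment assumptions: pytesseract needs a local Tesseract install with the
# Japanese language data ("jpn" traineddata), and the MarianMT tokenizer used
# below needs the sentencepiece package.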

# Define the shared state passed between nodes
class AgentState(TypedDict):
    image_path: str
    prompt: str
    extracted_text: Optional[str]
    translated_text: Optional[str]
    output_file: str

# Node 1: extract Japanese text via OCR
def ocr_node(state: AgentState) -> AgentState:
    try:
        image = Image.open(state["image_path"])
        text = pytesseract.image_to_string(image, lang='jpn')
        state["extracted_text"] = text.strip()
        return state
    except Exception as e:
        state["extracted_text"] = f"OCR错误: {str(e)}"
        return state

# Node 2: translate the Japanese text into Chinese
def translate_node(state: AgentState) -> AgentState:
    if not state["extracted_text"] or "错误" in state["extracted_text"]:
        state["translated_text"] = "无法翻译: 无有效文本"
        return state
    
    try:
        # Load the Hugging Face MarianMT ja->zh translation model (downloaded on first use)
        model_name = "Helsinki-NLP/opus-mt-ja-zh"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        
        inputs = tokenizer(state["extracted_text"], return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model.generate(**inputs)
        translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        state["translated_text"] = translated
        return state
    except Exception as e:
        state["translated_text"] = f"翻译错误: {str(e)}"
        return state

# Node 3: write the translation to a file
def write_to_file_node(state: AgentState) -> AgentState:
    try:
        with open(state["output_file"], "w", encoding="utf-8") as f:
            f.write(state["translated_text"] or "无翻译结果")
        return state
    except Exception as e:
        state["translated_text"] = f"文件写入错误: {str(e)}"
        return state

# Build the workflow graph
def create_workflow():
    workflow = StateGraph(AgentState)
    
    # Add the nodes
    workflow.add_node("ocr", ocr_node)
    workflow.add_node("translate", translate_node)
    workflow.add_node("write_to_file", write_to_file_node)
    
    # Wire the flow: OCR -> translate -> write to file
    workflow.set_entry_point("ocr")
    workflow.add_edge("ocr", "translate")
    workflow.add_edge("translate", "write_to_file")
    workflow.add_edge("write_to_file", END)
    
    return workflow.compile()

# Main function
def main(image_path: str, output_file: str = "translated_output.txt"):
    # Initialize the state
    state = AgentState(
        image_path=image_path,
        prompt="识别图片中的日文,并将其翻译成中文",
        extracted_text=None,
        translated_text=None,
        output_file=output_file
    )
    
    # Build and run the workflow
    app = create_workflow()
    final_state = app.invoke(state)
    
    return final_state

if __name__ == "__main__":
    # Example usage
    image_path = "path_to_your_image.jpg"  # replace with an actual image path
    output_file = "translated_output.txt"
    result = main(image_path, output_file)
    print(f"处理完成!翻译结果已保存至 {output_file}")
    print(f"提取的日文: {result['extracted_text']}")
    print(f"翻译的中文: {result['translated_text']}")

For the requirement above, LangChain alone is sufficient. LangChain implementation:

from langchain_core.tools import tool
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.prompts import PromptTemplate
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Define the tools
@tool
def ocr_image(image_path: str) -> str:
    """从图片中提取日文文本"""
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image, lang='jpn')
        return text.strip()
    except Exception as e:
        return f"OCR错误: {str(e)}"

@tool
def translate_ja_to_zh(text: str) -> str:
    """将日文翻译成中文"""
    if not text or "错误" in text:
        return "无法翻译: 无有效文本"
    
    try:
        model_name = "Helsinki-NLP/opus-mt-ja-zh"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model.generate(**inputs)
        translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translated
    except Exception as e:
        return f"翻译错误: {str(e)}"

@tool
def write_to_file(text: str, output_file: str = "translated_output.txt") -> str:
    """Write text to a file. output_file has a default because a ReAct agent
    passes its Action Input as a single string, which is bound to the first
    argument (text); the default path is used in that case."""
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(text)
        return f"成功写入文件: {output_file}"
    except Exception as e:
        return f"文件写入错误: {str(e)}"

# Build the ReAct prompt: create_react_agent requires {tools}, {tool_names} and
# {agent_scratchpad} in the template; {input} is filled in at invoke time.
prompt_template = PromptTemplate.from_template("""You are an AI assistant that processes text found in images. Your task is to:
1. Extract the Japanese text from an image
2. Translate the extracted Japanese into Chinese
3. Write the translation to a file

You have access to the following tools:

{tools}

Use the following format:

Question: the input task you must complete
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original task

Begin!

Question: {input}
Thought: {agent_scratchpad}""")

# Main function
def main(image_path: str, output_file: str = "translated_output.txt"):
    from langchain_openai import ChatOpenAI
    
    # Initialize the chat model (requires OPENAI_API_KEY; any other chat model can be swapped in)
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    
    # Tools available to the agent
    tools = [ocr_image, translate_ja_to_zh, write_to_file]
    
    # Create the ReAct agent and its executor
    agent = create_react_agent(llm, tools, prompt_template)
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, handle_parsing_errors=True)
    
    # Run the task
    input_prompt = f"Extract the Japanese text from the image {image_path}, translate it into Chinese, and write the result to {output_file}"
    result = agent_executor.invoke({"input": input_prompt})
    
    return result["output"]

if __name__ == "__main__":
    # Example usage
    image_path = "path_to_your_image.jpg"  # replace with an actual image path
    output_file = "translated_output.txt"
    result = main(image_path, output_file)
    print(f"处理完成!结果: {result}")