Task: recognize the Japanese text in an image and translate it into Chinese.
LangGraph implementation:
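The core pipeline is OCR → machine translation → file output. For reference, here is a minimal sketch of that pipeline as a plain script with no agent framework, assuming Tesseract (with the jpn language data) and the Helsinki-NLP/opus-mt-ja-zh model are available locally:

# Plain pipeline: OCR the image, then translate the extracted Japanese into Chinese.
from PIL import Image
import pytesseract
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def ocr_and_translate(image_path: str) -> str:
    japanese = pytesseract.image_to_string(Image.open(image_path), lang="jpn").strip()
    tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ja-zh")
    model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ja-zh")
    inputs = tokenizer(japanese, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

Both implementations below wrap this same pipeline in an orchestration layer.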
import os
from langgraph.graph import StateGraph, END
from typing import TypedDict, Optional
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
# Shared state passed between graph nodes
class AgentState(TypedDict):
    image_path: str
    prompt: str
    extracted_text: Optional[str]
    translated_text: Optional[str]
    output_file: str
# Node 1: extract Japanese text via OCR
def ocr_node(state: AgentState) -> AgentState:
    try:
        image = Image.open(state["image_path"])
        text = pytesseract.image_to_string(image, lang='jpn')
        state["extracted_text"] = text.strip()
        return state
    except Exception as e:
        state["extracted_text"] = f"OCR error: {str(e)}"
        return state
# Node 2: translate Japanese into Chinese
def translate_node(state: AgentState) -> AgentState:
    if not state["extracted_text"] or state["extracted_text"].startswith("OCR error"):
        state["translated_text"] = "Cannot translate: no valid text"
        return state
    try:
        # Use a Hugging Face translation model
        model_name = "Helsinki-NLP/opus-mt-ja-zh"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        inputs = tokenizer(state["extracted_text"], return_tensors="pt", padding=True)
        with torch.no_grad():
            outputs = model.generate(**inputs)
        translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        state["translated_text"] = translated
        return state
    except Exception as e:
        state["translated_text"] = f"Translation error: {str(e)}"
        return state
# Node 3: write the result to a file
def write_to_file_node(state: AgentState) -> AgentState:
    try:
        with open(state["output_file"], "w", encoding="utf-8") as f:
            f.write(state["translated_text"] or "No translation result")
        return state
    except Exception as e:
        state["translated_text"] = f"File write error: {str(e)}"
        return state
# Build the workflow graph
def create_workflow():
    workflow = StateGraph(AgentState)
    # Add nodes
    workflow.add_node("ocr", ocr_node)
    workflow.add_node("translate", translate_node)
    workflow.add_node("write_to_file", write_to_file_node)
    # Wire up the linear flow: OCR -> translate -> write to file
    workflow.set_entry_point("ocr")
    workflow.add_edge("ocr", "translate")
    workflow.add_edge("translate", "write_to_file")
    workflow.add_edge("write_to_file", END)
    return workflow.compile()
# Entry point
def main(image_path: str, output_file: str = "translated_output.txt"):
    # Initial state
    state = AgentState(
        image_path=image_path,
        prompt="Recognize the Japanese text in the image and translate it into Chinese",
        extracted_text=None,
        translated_text=None,
        output_file=output_file
    )
    # Build and run the workflow
    app = create_workflow()
    final_state = app.invoke(state)
    return final_state

if __name__ == "__main__":
    # Example usage
    image_path = "path_to_your_image.jpg"  # replace with an actual image path
    output_file = "translated_output.txt"
    result = main(image_path, output_file)
    print(f"Done! The translation has been saved to {output_file}")
    print(f"Extracted Japanese: {result['extracted_text']}")
    print(f"Chinese translation: {result['translated_text']}")
For this requirement LangChain alone is sufficient. LangChain implementation:
from langchain_core.tools import tool
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import os
# Tool definitions
@tool
def ocr_image(image_path: str) -> str:
    """Extract Japanese text from an image."""
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image, lang='jpn')
        return text.strip()
    except Exception as e:
        return f"OCR error: {str(e)}"
@tool
def translate_ja_to_zh(text: str) -> str:
    """Translate Japanese text into Chinese."""
    if not text or text.startswith("OCR error"):
        return "Cannot translate: no valid text"
    try:
        model_name = "Helsinki-NLP/opus-mt-ja-zh"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        inputs = tokenizer(text, return_tensors="pt", padding=True)
        with torch.no_grad():
            outputs = model.generate(**inputs)
        translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translated
    except Exception as e:
        return f"Translation error: {str(e)}"
@tool
def write_to_file(text: str, output_file: str) -> str:
    """Write text to a file."""
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(text)
        return f"Successfully wrote file: {output_file}"
    except Exception as e:
        return f"File write error: {str(e)}"
# Prompt for the agent: system instructions plus a scratchpad placeholder
# where tool calls and their results accumulate
prompt_template = ChatPromptTemplate.from_messages([
    ("system",
     "You are an AI assistant that can process text in images. Your task is to:\n"
     "1. Extract the Japanese text from the image\n"
     "2. Translate the extracted Japanese into Chinese\n"
     "3. Write the translation result to a file\n"
     "Use the following tools to complete the task:\n"
     "- ocr_image: extract Japanese text from an image\n"
     "- translate_ja_to_zh: translate Japanese into Chinese\n"
     "- write_to_file: write text to a file"),
    ("human", "{input}"),
    ("placeholder", "{agent_scratchpad}"),
])
# Entry point
def main(image_path: str, output_file: str = "translated_output.txt"):
    from langchain_openai import ChatOpenAI
    # Initialize the language model (requires an OpenAI API key; any chat
    # model that supports tool calling can be substituted)
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    # Tool list
    tools = [ocr_image, translate_ja_to_zh, write_to_file]
    # Build the agent; a tool-calling agent is used instead of the string-based
    # ReAct agent because write_to_file takes two arguments, which the ReAct
    # parser (single string input per tool) cannot pass through
    agent = create_tool_calling_agent(llm, tools, prompt_template)
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
    # Run the task
    input_prompt = (
        f"Extract the Japanese text from the image {image_path}, translate it "
        f"into Chinese, and write the result to {output_file}"
    )
    result = agent_executor.invoke({"input": input_prompt})
    return result["output"]
if __name__ == "__main__":
    # Example usage
    image_path = "path_to_your_image.jpg"  # replace with an actual image path
    output_file = "translated_output.txt"
    result = main(image_path, output_file)
    print(f"Done! Result: {result}")