forked from Mirrors/Marlin
288 lines
13 KiB
Python
Executable File
288 lines
13 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
'''
languageExport.py [--single] [--translate] [--language LANG] [--limit N]
                  [--model NAME] [--dothink] [--verbose]

Export LCD language strings to CSV files for easier translation.
Use languageImport.py to import CSV into the language files.

Use --single to export all languages to a single CSV file.
Use --translate to fill in missing translations with an LLM (through the 'ollama' package).
'''
|
||
|
||
import re, argparse
|
||
from pathlib import Path
|
||
from sys import argv, exit
|
||
from languageUtil import *
|
||
|
||
LANGHOME = "Marlin/src/lcd/language"
|
||
OUTDIR = Path('out-csv')
|
||
|
||
def language_export(args=None):
    '''
    Export the LCD strings found in LANGHOME/language_*.h to CSV.

    Each language header is scanned for 'LSTR NAME = value' lines. Strings are
    filed under 'narrow', 'wide' or 'tall' depending on the '#if' section they
    appear in. Optionally, missing translations are requested from an LLM via
    the 'ollama' package. The result is written either to a single
    'languages.csv' or to one 'language_<code>.csv' per language inside OUTDIR.

    args: an argparse.Namespace (or any object with a __dict__, or a dict, or
          None) providing these options. Missing options get the default shown:
            language  (None)  list of language codes to load besides 'en';
                              a single space-delimited item is also accepted
            single    (False) write one combined sheet instead of one per language
            verbose   (False) print the list of loaded language codes
            limit     (0)     stop reading each file after this many strings; 0 = no limit
            translate (False) fill in missing translations with the LLM
            dothink   (False) omit the "Do not think!" instruction from the system prompt
            model     (None)  model name to use instead of DEFAULT_MODEL

    Returns None. Output goes to the CSV file(s) and to stdout.
    '''
    # Accept None / dict / Namespace and make sure every option exists, so the
    # function can be called without going through the command-line parser.
    defaults = {
        'language': None, 'single': False, 'verbose': False, 'limit': 0,
        'translate': False, 'dothink': False, 'model': None,
    }
    if args is None:
        given = {}
    elif isinstance(args, dict):
        given = args
    else:
        given = vars(args)
    args = argparse.Namespace(**{**defaults, **given})

    # Convert once. The limit may arrive as a string from the command line;
    # 0 (or "0") means "no limit".
    limit = int(args.limit or 0)

    # Allow space-delimited list or multiple arguments.
    # Computed once, up front, so it is always defined inside the loop below.
    language_args = None
    if args.language:
        language_args = args.language[0].split(' ') if ' ' in args.language[0] else args.language

    # A dictionary to contain strings for each language.
    # Init with 'en' so English will always be first. The placeholder carries
    # all three sections so later lookups work even if language_en.h is absent.
    language_strings = { 'en': { 'narrow': {}, 'wide': {}, 'tall': {} } }

    # A dictionary to contain all distinct LCD string names
    names = {}

    # Get all "language_*.h" files
    langfiles = sorted(Path(LANGHOME).glob('language_*.h'))

    # Read each language file
    for langfile in langfiles:
        # Get the language code from the filename
        langcode = langfile.name.replace('language_', '').replace('.h', '')

        # Skip 'test' and any others that we don't want
        if langcode in ['test']: continue

        # Always load canonical US English and specified (or all other) languages
        if langcode != 'en' and language_args and langcode not in language_args: continue

        # Flags to indicate a wide or tall section
        wideflag, tallflag = False, False
        # A counter for the number of strings in the file
        stringcount = 0
        # A dictionary to hold all the strings
        strings = { 'narrow': {}, 'wide': {}, 'tall': {} }

        # Read each line in the file (the handle is closed even on error)
        with open(langfile, 'r', encoding='utf-8') as f:
            for line in f:
                # Clean up the line for easier parsing
                line = line.split("//")[0].strip()
                if line.endswith(';'): line = line[:-1].strip()

                # Check for wide or tall sections, assume no complicated nesting
                if line.startswith("#endif") or line.startswith("#else"):
                    wideflag, tallflag = False, False
                elif re.match(r'#if.*WIDTH\s*>=?\s*2[01].*', line): wideflag = True
                elif re.match(r'#if.*LCD_HEIGHT\s*>=?\s*4.*', line): tallflag = True

                # For string-defining lines capture the string data
                match = re.match(r'LSTR\s+([A-Z0-9_]+)\s*=\s*(.+)\s*', line)
                if not match: continue

                # Name and quote-sanitized value. Escaped quotes are parked as
                # '$$$' so the quote-stripping below cannot touch them.
                name, value = match.group(1), match.group(2).replace('\\"', '$$$')

                # Remove all _UxGT wrappers from the value in a non-greedy way
                value = re.sub(r'_UxGT\((".*?")\)', r'\1', value)

                # Multi-line strings get one or more bars | for identification
                multimatch = re.match(r'.*MSG_(\d)_LINE\s*\(\s*(.+?)\s*\).*', value)
                if multimatch:
                    value = '|' + re.sub(r'"\s*,\s*"', '|', multimatch.group(2))

                # Wrap inline defines in parentheses
                value = re.sub(r' *([A-Z0-9]+_[A-Z0-9_]+) *', r'(\1)', value)
                # Remove quotes around strings; parked quotes come back doubled (CSV escaping)
                value = re.sub(r'"(.*?)"', r'\1', value).replace('$$$', '""')

                # Store all unique names as dictionary keys
                names[name] = 1
                # Store the string as narrow, wide, tall
                section = 'tall' if tallflag else 'wide' if wideflag else 'narrow'
                strings[section][name] = value

                # Increment the string counter
                stringcount += 1
                # Break for testing
                if limit and stringcount >= limit: break

        # Store the array in the dict
        language_strings[langcode] = strings

    # Get the codes of all imported languages
    langcodes = list(language_strings.keys())

    if args.verbose:
        print("Languages:", ' '.join(langcodes))

    # Report the total number of unique strings
    print("Found %s distinct LCD strings." % len(names))

    # Add missing translations, if specified
    if args.translate:

        MIN_TRANSLATE_LEN = 2
        NEVER_TRANSLATE_LANGS = ( 'el_CY', 'fr_na' )
        NEVER_TRANSLATE_NAMES = (
            "MSG_MARLIN", "MSG_CUSTOM_MENU_MAIN_TITLE",
            "MSG_PID_P", "MSG_PID_P_E",
            "MSG_PID_I", "MSG_PID_I_E",
            "MSG_PID_D", "MSG_PID_D_E",
            "MSG_PID_C", "MSG_PID_C_E",
            "MSG_PID_F", "MSG_PID_F_E",
            "MSG_BACKLASH_N",
            "MSG_SHORT_DAY", "MSG_SHORT_HOUR", "MSG_SHORT_MINUTE",
            "MSG_FTM_ZV", "MSG_FTM_ZVD", "MSG_FTM_ZVDD", "MSG_FTM_ZVDDD",
            "MSG_FTM_EI", "MSG_FTM_2HEI", "MSG_FTM_3HEI", "MSG_FTM_MZV"
        )

        # Imported here so the package is only required when translating
        import ollama

        # Candidate models; the trailing index selects the default
        DEFAULT_MODEL = (
            "qwen3:32b",                  #  0  22 GB
            "gpt-oss:20b",                #  1  13 GB
            "llama3.3",                   #  2  45 GB
            "deepseek-r1:14b",            #  3   9 GB
            "deepseek-r1-qwen-14b",       #  4  15 GB
            "devstral:24b",               #  5  15 GB
            "qwen3-coder:30b",            #  6  18 GB
            "mistral-small-3.2",          #  7  14 GB
            "openthinker:32b",            #  8  19 GB
            "deepseek-v2",                #  9   9 GB
            "deepseek-coder-v2",          # 10   9 GB
            "llama3.2:3b-instruct-fp16"   # 11   6 GB
        )[0]

        llm_model = args.model if args.model else DEFAULT_MODEL

        # Build the system message list for one section ('narrow', 'wide' or 'tall')
        def get_system_prompt(args, sect):
            if sect == 'narrow':
                length_limit = "no more than 18 characters long! Use common abbreviations whenever necessary"
            elif sect == 'tall':
                length_limit = "no more than 3 strings of 20 characters. Use common abbreviations if necessary"
            else: # wide
                length_limit = "around the same length as the given example(s)"

            no_thinking = "" if args.dothink else "Do not think! Just translate.\n"

            system_prompt_text = f"""You are an expert in language translation in the context of 3D printing.
You will be given a list of existing translations and will be asked to provide a new translation in the given language.
When provided, the English (en) translation should be considered the most authoritative source.
Named variable substitutions are written as UPPERCASE_WITH_UNDERSCORES. Never translate or modify these!
The symbols `@`, `~`, `*`, `{{`, and `$` are special characters used for substitution. Never translate or modify these!
Your translations must be {length_limit}.
Assume that variable substitutions such as (MACHINE_NAME) are short strings for the purpose of character counting.
{no_thinking}For each translation requested, respond only with the translated string, no introduction, explanation, or assessment.
This clean output will be perfect for our use case."""

            return [{ 'role': 'system', 'content': system_prompt_text }]

        # Send a prompt to Ollama and return the reply text
        def prompt_with_ollama(SYSTEM_PROMPT, prompt:str):
            msg = [{ 'role': 'user', 'content': prompt }]
            response = ollama.chat(model=llm_model, messages=SYSTEM_PROMPT + msg, stream=False)
            reply = response['message']['content'].strip('\n')
            # Drop any <think>...</think> section, then one leading/trailing double quote
            reply = re.sub(r'<think>[\s\S]+</think>\n*', '', reply)
            reply = re.sub(r'(^"|"$)', '', reply)
            return reply

        # For each named string fill in any missing translations
        for sect in ('narrow','wide','tall'):
            system_prompt = get_system_prompt(args, sect)
            for name in names.keys():
                if name in NEVER_TRANSLATE_NAMES: continue
                en_string = language_strings['en'][sect][name] if name in language_strings['en'][sect] else ""
                glyphs = len(en_string)

                done = {} # All existing translations for the given name
                todo = [] # Missing translation keys to create
                for lang in langcodes:
                    strings = language_strings[lang]
                    if name in strings[sect]:
                        done[lang] = strings[sect][name]
                    elif glyphs >= MIN_TRANSLATE_LEN and lang not in NEVER_TRANSLATE_LANGS:
                        todo += [lang]

                # For each untranslated language, fill in a translation
                for lang in todo:
                    # Show existing translations to the LLM and ask for one more
                    prompt = [ f"Please translate the following string into {language_name(lang)} ({lang})." ]
                    if lang.endswith("_na"):
                        prompt += [ "(Substitute plain unaccented ASCII characters for accented characters in the output.)" ]
                    prompt += [ "Here are the existing translations:" ]
                    for dlang in done.keys():
                        prompt += [ f"- {dlang} {language_name(dlang)}: \"{done[dlang]}\"" ]
                    prompt = '\n'.join(prompt)
                    #print(f"Prompt: {prompt}")
                    reply = prompt_with_ollama(system_prompt, prompt)

                    # Normalize typographic characters in the reply.
                    # NOTE(review): several pairs below read as identical on both sides in this
                    # copy (e.g. '/' -> '/'); presumably the left side was originally a
                    # full-width or decomposed variant -- confirm against the upstream file.
                    newstring = reply.replace('–','-').replace('‑','-').replace('/','/').replace('’',"'").replace('…','...').replace('\u202F',' ').replace('\uFEFF', '').replace('!', '! ').replace('。', '. ').replace('ç','ç').replace('ş','ş').replace('6','6').replace('@', '@').replace('~', '~')

                    # Trim the space the replacements above may leave after a final '!' or '.'.
                    # The replacement must be a raw string: a plain '\1' is the control
                    # character U+0001, not a reference to group 1.
                    newstring = re.sub(r'([!.]) $', r'\1', newstring)

                    if newstring != en_string:
                        print(f"{name} ({lang}) = \"{newstring}\"")
                        # Later languages in 'todo' get to see this new translation too
                        done[lang] = newstring
                        if sect not in language_strings[lang]: language_strings[lang][sect] = {}
                        language_strings[lang][sect][name] = newstring
                    else:
                        print(f"{name} ({lang}) = (same as English)")

    # Write a single language entry to the CSV file with narrow, wide, and tall strings
    def write_csv_lang(f, strings, name):
        f.write(',')
        if name in strings['narrow']: f.write('"%s"' % strings['narrow'][name])
        f.write(',')
        if name in strings['wide']: f.write('"%s"' % strings['wide'][name])
        f.write(',')
        if name in strings['tall']: f.write('"%s"' % strings['tall'][name])

    if args.single:
        #
        # Export one large sheet containing all specified languages
        #
        with open("languages.csv", 'w', encoding='utf-8') as f:
            header = ['name']
            for lang in langcodes:
                lname = lang + ' ' + language_name(lang)
                header += [lname, lname + ' (wide)', lname + ' (tall)']
            f.write('"' + '","'.join(header) + '"\n')

            for name in names.keys():
                f.write('"' + name + '"')
                for lang in langcodes: write_csv_lang(f, language_strings[lang], name)
                f.write('\n')
    else:
        #
        # Export a separate sheet for each language
        #
        OUTDIR.mkdir(exist_ok=True)

        for lang in langcodes:
            with open(OUTDIR / f"language_{lang}.csv", 'w', encoding='utf-8') as f:
                lname = lang + ' ' + language_name(lang)
                header = ['name', lname, lname + ' (wide)', lname + ' (tall)']
                f.write('"' + '","'.join(header) + '"\n')

                for name in names.keys():
                    f.write('"' + name + '"')
                    write_csv_lang(f, language_strings[lang], name)
                    f.write('\n')
|
||
|
||
# Command-line entry point: verify the working directory, parse the options,
# and run the export.
if __name__ == "__main__":
    # Check for the path to the language files
    if not Path(LANGHOME).is_dir():
        print(f"Error: Couldn't find the '{LANGHOME}' directory.")
        print("Edit LANGHOME or cd to the root of the repo before running.")
        exit(1)

    # Parse and validate all arguments
    parser = argparse.ArgumentParser(description="Export LCD language strings to CSV with optional translation")
    parser.add_argument('-l', '--language', action="append", default=None, help="specify language(s) to translate from canonical English")
    parser.add_argument('-s', '--single', action="store_true", help="output a single spreadsheet (languages.csv)")
    parser.add_argument('-v', '--verbose', action="store_true", help="extra output for debugging")
    # type=int makes argparse reject a non-numeric limit up front, and makes
    # "-n 0" an integer 0 (falsy, i.e. "no limit") like the default instead
    # of the truthy string "0".
    parser.add_argument('-n', '--limit', type=int, default=0, help="limit the number of exported items")
    parser.add_argument('-t', '--translate', action="store_true", help="use an LLM to translate strings")
    parser.add_argument('-d', '--dothink', action="store_true", help="use thinking if the model supports it")
    parser.add_argument('-m', '--model', default=None, help="override the default LLM model for translation")
    args = parser.parse_args()

    # These options only affect translation; tell the user when they have no effect
    if not args.translate:
        if args.model: print("--model ignored when not translating")
        if args.dothink: print("--dothink ignored when not translating")

    language_export(args)
|