Files
Marlin/buildroot/share/scripts/languageExport.py
2026-02-08 14:39:25 -06:00

288 lines
13 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
'''
languageExport.py [--single] [--translate]
Export LCD language strings to CSV files for easier translation.
Use languageImport.py to import CSV into the language files.
Use --single to export all languages to a single CSV file.
'''
import re, argparse
from pathlib import Path
from sys import argv, exit
from languageUtil import *
# Directory searched for the "language_*.h" headers (relative to the current working directory)
LANGHOME = "Marlin/src/lcd/language"
# Folder that receives the per-language CSV files when not exporting a single sheet
OUTDIR = Path('out-csv')
def language_export(args={}):
    '''
    Export the LCD strings of every language header to CSV.

    Reads each "language_*.h" file under LANGHOME, collects the LSTR
    definitions as narrow / wide / tall variants, optionally asks a local
    Ollama model to fill in missing translations, then writes either one
    combined "languages.csv" or one CSV per language under OUTDIR.

    args is expected to be the parsed command-line namespace. The attributes
    read here are: language, limit, verbose, translate, model, dothink and
    single. (The {} default is kept for interface compatibility only; a plain
    dict does not provide these attributes.)
    '''
    # A dictionary to contain strings for each language.
    # Init with 'en' so English will always be first. The empty sections keep
    # later lookups safe even if no English header gets loaded.
    language_strings = { 'en': { 'narrow': {}, 'wide': {}, 'tall': {} } }

    # A dictionary to contain all distinct LCD string names
    names = {}

    # Get all "language_*.h" files
    langfiles = sorted(Path(LANGHOME).glob('language_*.h'))

    # Allow space-delimited list or multiple arguments.
    # Resolved once up front, and left empty when no language was requested,
    # so the filter below never depends on a conditionally-assigned name.
    language_args = []
    if args.language:
        language_args = args.language[0].split(' ') if ' ' in args.language[0] else args.language

    # Per-file cap on the number of strings read (0 / unset means no cap)
    limit = int(args.limit) if args.limit else 0

    # Read each language file
    for langfile in langfiles:
        # Get the language code from the filename
        langcode = langfile.name.replace('language_', '').replace('.h', '')

        # Skip 'test' and any others that we don't want
        if langcode in ['test']: continue

        # Always load canonical US English and specified (or all other) languages
        if langcode != 'en' and language_args and langcode not in language_args: continue

        # Flags to indicate a wide or tall section
        wideflag, tallflag = False, False

        # A counter for the number of strings in the file
        stringcount = 0

        # A dictionary to hold all the strings
        strings = { 'narrow': {}, 'wide': {}, 'tall': {} }

        # Open the file; the context manager closes it even on an early break or error
        with open(langfile, 'r', encoding='utf-8') as f:
            # Read each line in the file
            for line in f:
                # Clean up the line for easier parsing
                line = line.split("//")[0].strip()
                if line.endswith(';'): line = line[:-1].strip()

                # Check for wide or tall sections, assume no complicated nesting
                if line.startswith("#endif") or line.startswith("#else"):
                    wideflag, tallflag = False, False
                elif re.match(r'#if.*WIDTH\s*>=?\s*2[01].*', line): wideflag = True
                elif re.match(r'#if.*LCD_HEIGHT\s*>=?\s*4.*', line): tallflag = True

                # For string-defining lines capture the string data
                match = re.match(r'LSTR\s+([A-Z0-9_]+)\s*=\s*(.+)\s*', line)
                if not match: continue

                # Name and quote-sanitized value
                # (escaped quotes are parked as '$$$' so the quote-stripping below can't see them)
                name, value = match.group(1), match.group(2).replace('\\"', '$$$')

                # Remove all _UxGT wrappers from the value in a non-greedy way
                value = re.sub(r'_UxGT\((".*?")\)', r'\1', value)

                # Multi-line strings get one or more bars | for identification
                multimatch = re.match(r'.*MSG_(\d)_LINE\s*\(\s*(.+?)\s*\).*', value)
                if multimatch:
                    value = '|' + re.sub(r'"\s*,\s*"', '|', multimatch.group(2))

                # Wrap inline defines in parentheses
                value = re.sub(r' *([A-Z0-9]+_[A-Z0-9_]+) *', r'(\1)', value)

                # Remove quotes around strings; parked quotes come back CSV-escaped as ""
                value = re.sub(r'"(.*?)"', r'\1', value).replace('$$$', '""')

                # Store all unique names as dictionary keys
                names[name] = 1

                # Store the string as narrow, wide, tall
                section = 'tall' if tallflag else 'wide' if wideflag else 'narrow'
                strings[section][name] = value

                # Increment the string counter
                stringcount += 1

                # Break for testing
                if limit and stringcount >= limit: break

        # Store the array in the dict
        language_strings[langcode] = strings

    # Get the codes of all imported languages
    langcodes = list(language_strings.keys())
    if args.verbose:
        print("Languages:", ' '.join(langcodes))

    # Report the total number of unique strings
    print("Found %s distinct LCD strings." % len(names))

    # Add missing translations, if specified
    if args.translate:

        MIN_TRANSLATE_LEN = 2
        NEVER_TRANSLATE_LANGS = ( 'el_CY', 'fr_na' )
        NEVER_TRANSLATE_NAMES = (
            "MSG_MARLIN", "MSG_CUSTOM_MENU_MAIN_TITLE",
            "MSG_PID_P", "MSG_PID_P_E",
            "MSG_PID_I", "MSG_PID_I_E",
            "MSG_PID_D", "MSG_PID_D_E",
            "MSG_PID_C", "MSG_PID_C_E",
            "MSG_PID_F", "MSG_PID_F_E",
            "MSG_BACKLASH_N",
            "MSG_SHORT_DAY", "MSG_SHORT_HOUR", "MSG_SHORT_MINUTE",
            "MSG_FTM_ZV", "MSG_FTM_ZVD", "MSG_FTM_ZVDD", "MSG_FTM_ZVDDD",
            "MSG_FTM_EI", "MSG_FTM_2HEI", "MSG_FTM_3HEI", "MSG_FTM_MZV"
        )

        # (old, new) pairs applied in order to normalize an LLM reply.
        # NOTE(review): in this copy of the file the search string of most
        # pairs is empty - presumably non-ASCII / look-alike characters that
        # were lost from the source text. TODO: restore them from the
        # repository copy. Until then those pairs are skipped (see below).
        REPLY_FIXUPS = (
            ('', '-'), ('', '-'), ('', '/'), ('', "'"), ('', '...'),
            ('\u202F', ' '), ('\uFEFF', ''),
            ('', '! '), ('', '. '),
            ('', 'ç'), ('', 'ş'), ('', '6'),
            ('', '@'), ('', '~'),
        )

        # Imported only when translating, so a plain export doesn't need the package
        import ollama

        DEFAULT_MODEL = (
            "qwen3:32b",                # 0   22 GB
            "gpt-oss:20b",              # 1   13 GB
            "llama3.3",                 # 2   45 GB
            "deepseek-r1:14b",          # 3    9 GB
            "deepseek-r1-qwen-14b",     # 4   15 GB
            "devstral:24b",             # 5   15 GB
            "qwen3-coder:30b",          # 6   18 GB
            "mistral-small-3.2",        # 7   14 GB
            "openthinker:32b",          # 8   19 GB
            "deepseek-v2",              # 9    9 GB
            "deepseek-coder-v2",        # 10   9 GB
            "llama3.2:3b-instruct-fp16" # 11   6 GB
        )[0]

        llm_model = args.model if args.model else DEFAULT_MODEL

        # Build the system message list for one section ('narrow', 'wide' or 'tall')
        def get_system_prompt(args, sect):
            if sect == 'narrow':
                length_limit = "no more than 18 characters long! Use common abbreviations whenever necessary"
            elif sect == 'tall':
                length_limit = "no more than 3 strings of 20 characters. Use common abbreviations if necessary"
            else: # wide
                length_limit = "around the same length as the given example(s)"

            no_thinking = "" if args.dothink else "Do not think! Just translate.\n"

            system_prompt_text = f"""You are an expert in language translation in the context of 3D printing.
You will be given a list of existing translations and will be asked to provide a new translation in the given language.
When provided, the English (en) translation should be considered the most authoritative source.
Named variable substitutions are written as UPPERCASE_WITH_UNDERSCORES. Never translate or modify these!
The symbols `@`, `~`, `*`, `{{`, and `$` are special characters used for substitution. Never translate or modify these!
Your translations must be {length_limit}.
Assume that variable substitutions such as (MACHINE_NAME) are short strings for the purpose of character counting.
{no_thinking}For each translation requested, respond only with the translated string, no introduction, explanation, or assessment.
This clean output will be perfect for our use case."""

            return [{ 'role': 'system', 'content': system_prompt_text }]

        # Send a prompt to Ollama and return the reply text
        def prompt_with_ollama(SYSTEM_PROMPT, prompt:str):
            msg = [{ 'role': 'user', 'content': prompt }]
            response = ollama.chat(model=llm_model, messages=SYSTEM_PROMPT + msg, stream=False)
            reply = response['message']['content'].strip('\n')
            # Drop any <think>...</think> block the model emitted
            reply = re.sub(r'<think>[\s\S]+</think>\n*', '', reply)
            # Drop one leading and one trailing double quote
            reply = re.sub(r'(^"|"$)', '', reply)
            return reply

        # For each named string fill in any missing translations
        for sect in ('narrow','wide','tall'):
            system_prompt = get_system_prompt(args, sect)
            for name in names:
                if name in NEVER_TRANSLATE_NAMES: continue

                en_string = language_strings['en'][sect].get(name, "")
                glyphs = len(en_string)

                done = {} # All existing translations for the given name
                todo = [] # Missing translation keys to create
                for lang in langcodes:
                    strings = language_strings[lang]
                    if name in strings[sect]:
                        done[lang] = strings[sect][name]
                    elif glyphs >= MIN_TRANSLATE_LEN and lang not in NEVER_TRANSLATE_LANGS:
                        todo += [lang]

                # For each untranslated language, fill in a translation
                for lang in todo:
                    # Show existing translations to the LLM and ask for one more
                    prompt = [ f"Please translate the following string into {language_name(lang)} ({lang})." ]
                    if lang.endswith("_na"):
                        prompt += [ "(Substitute plain unaccented ASCII characters for accented characters in the output.)" ]
                    prompt += [ "Here are the existing translations:" ]
                    for dlang, dstring in done.items():
                        prompt += [ f"- {dlang} {language_name(dlang)}: \"{dstring}\"" ]
                    prompt = '\n'.join(prompt)

                    reply = prompt_with_ollama(system_prompt, prompt)

                    # Normalize the reply. An empty search string is skipped because
                    # str.replace('', x) would insert x between every character.
                    newstring = reply
                    for old, new in REPLY_FIXUPS:
                        if old: newstring = newstring.replace(old, new)

                    # Trim the space after a final '!' or '.'. The replacement must be a
                    # raw string: a plain '\1' is the control character U+0001, not group 1.
                    newstring = re.sub(r'([!.]) $', r'\1', newstring)

                    if newstring != en_string:
                        print(f"{name} ({lang}) = \"{newstring}\"")
                        # Later languages in this pass get to see the new translation too
                        done[lang] = newstring
                        if sect not in language_strings[lang]: language_strings[lang][sect] = {}
                        language_strings[lang][sect][name] = newstring
                    else:
                        print(f"{name} ({lang}) = (same as English)")

    # Write a single language entry to the CSV file with narrow, wide, and tall strings
    def write_csv_lang(f, strings, name):
        for section in ('narrow', 'wide', 'tall'):
            # Every column gets its leading comma, even when the cell is empty
            f.write(',')
            if name in strings[section]: f.write('"%s"' % strings[section][name])

    if args.single:
        #
        # Export one large sheet containing all specified languages
        #
        with open("languages.csv", 'w', encoding='utf-8') as f:
            header = ['name']
            for lang in langcodes:
                lname = lang + ' ' + language_name(lang)
                header += [lname, lname + ' (wide)', lname + ' (tall)']
            f.write('"' + '","'.join(header) + '"\n')

            for name in names:
                f.write('"' + name + '"')
                for lang in langcodes: write_csv_lang(f, language_strings[lang], name)
                f.write('\n')

    else:
        #
        # Export a separate sheet for each language
        #
        OUTDIR.mkdir(exist_ok=True)

        for lang in langcodes:
            with open(OUTDIR / f"language_{lang}.csv", 'w', encoding='utf-8') as f:
                lname = lang + ' ' + language_name(lang)
                header = ['name', lname, lname + ' (wide)', lname + ' (tall)']
                f.write('"' + '","'.join(header) + '"\n')

                for name in names:
                    f.write('"' + name + '"')
                    write_csv_lang(f, language_strings[lang], name)
                    f.write('\n')
if __name__ == "__main__":
    # Script entry point: verify the working directory, parse the command
    # line, then run the export.

    # Check for the path to the language files
    if not Path(LANGHOME).is_dir():
        print(f"Error: Couldn't find the '{LANGHOME}' directory.")
        print("Edit LANGHOME or cd to the root of the repo before running.")
        exit(1)

    # Parse and validate all arguments
    parser = argparse.ArgumentParser(description="Export LCD language strings to CSV with optional translation")
    parser.add_argument('-l', '--language', action="append", default=None, help="specify language(s) to translate from canonical English")
    parser.add_argument('-s', '--single', action="store_true", help="output a single spreadsheet (languages.csv)")
    parser.add_argument('-v', '--verbose', action="store_true", help="extra output for debugging")
    # type=int lets argparse reject a non-numeric limit with a usage error
    # instead of failing later with a ValueError inside the export
    parser.add_argument('-n', '--limit', type=int, default=0, help="limit the number of exported items")
    parser.add_argument('-t', '--translate', action="store_true", help="use an LLM to translate strings")
    parser.add_argument('-d', '--dothink', action="store_true", help="use thinking if the model supports it")
    parser.add_argument('-m', '--model', default=None, help="override the default LLM model for translation")
    args = parser.parse_args()

    # These options only affect translation; tell the user when they have no effect
    if not args.translate:
        if args.model: print("--model ignored when not translating")
        if args.dothink: print("--dothink ignored when not translating")

    language_export(args)