-- Módulo:generar-pron/uk
--
-- Documentation for this module can be created at Módulo:generar-pron/uk/doc
local export = {}

local insert = table.insert
local concat = table.concat

local m_str = require("Módulo:String")

local u = m_str.char
local strlower = m_str.lower
local strfind = m_str.find
local strsplit = m_str.split
local strstrip = m_str.strip
local strsubn = m_str.gsub
local strsubrep = m_str.gsub_rep
local strstrip = m_str.strip
local strlen = m_str.len
local strnfc = m_str.toNFC
local strnfd = m_str.toNFD
local strhtml = m_str.encode_html

-- Single-result wrapper around strsubn(): yields only the substituted string
-- and drops every further value (e.g. the match count) strsubn() returns.
local function strsub(term, foo, bar)
	-- The extra parentheses truncate the call to exactly one return value.
	return (strsubn(term, foo, bar))
end

-- Turn the sequence `t` into a lookup table: every item of `t` becomes a key
-- mapped to true, so membership can be tested as `result[item]`.
local function list_to_set(t)
	local members = {}
	for _, member in ipairs(t) do
		members[member] = true
	end
	return members
end

-- Character class of fragment-delimiting punctuation; normalizar() replaces
-- each match with the IPA foot boundary " | ".
local PUNTUACION = "[%(%)%[%]%{%}¡!¿?.,;:–—]"
-- PUNTUACION plus quotation marks and apostrophe-like characters; normalizar()
-- deletes every match that is still present after the step above.
local PUNTUACION_EXTRA = "[%(%)%[%]%{%}¡!¿?.,;:–—\"“”„‟‘’«»»«‹››‹'´]"

-- Accent characters built from code points 0x301 and 0x300 through m_str.char;
-- phonetic_chars_map maps them to the IPA stress marks ˈ and ˌ respectively.
local AC = u(0x301)
local GR = u(0x300)
-- Character class matching either accent character.
local accents_c = "[" .. AC .. GR .. "]"

-- IPA symbol inventories used to build the patterns in convertir(); each
-- *_c variant wraps the same characters in a [...] character class.
local ipa_vowel_no_i = "aɛɪuɔɐoʊe"
local ipa_vowel = ipa_vowel_no_i .. "i"
local ipa_vowel_c = "[" .. ipa_vowel .. "]"
-- Consonant symbols without the /ʋ/ allophones, which are added just below.
local ipa_consonant_no_w = "bdzʒɡɦmnlrpftskxʃj"
local ipa_consonant_no_w_c = "[" .. ipa_consonant_no_w .. "]"
local ipa_consonant = ipa_consonant_no_w .. "ʋβ̞wʍ"
local ipa_consonant_c = "[" .. ipa_consonant .. "]"
-- Consonants that convertir() marks with ʲ when they precede /i/ or /j/.
local ipa_palatalizable = "tdsznlrbpʋfɡmkɦxʃʒ"
local ipa_palatalizable_c = "[" .. ipa_palatalizable .. "]"

-- Cyrillic letter inventories (lower and upper case); each *_c variant wraps
-- the same characters in a [...] character class for use in patterns.

-- Cyrillic vowel letters
local vowel = "аеиоуіїяєюАЕИОУІЇЯЄЮ"
-- (this class was previously declared under the misspelled name `owel_c`)
local vowel_c = "[" .. vowel .. "]"
-- anything that is not a vowel letter; used to count syllables
local non_vowel_c = "[^" .. vowel .. "]"
-- Cyrillic consonant letters other than the hushing ones and ц; the soft sign
-- and the ASCII apostrophe are listed here as well
local cons_except_hushing_or_ts = "бдфгґйклмнпрствхзь'БДФГҐЙКЛМНПРСТВХЗЬ"
local cons_except_hushing_or_ts_c = "[" .. cons_except_hushing_or_ts .. "]"
-- Cyrillic hushing consonants
local hushing = "чшжщЧШЖЩ"
local hushing_c = "[" .. hushing .. "]"
local hushing_or_ts = hushing .. "цЦ"
local hushing_or_ts_c = "[" .. hushing_or_ts .. "]"
-- every Cyrillic consonant letter
local cons = cons_except_hushing_or_ts .. hushing_or_ts
local cons_c = "[" .. cons .. "]"
-- Cyrillic velar consonants
local velar = "кгґхКГҐХ"
local velar_c = "[" .. velar .. "]"
-- uppercase Cyrillic letters (vowels and consonants alike)
local uppercase = "АЕИОУІЇЯЄЮБЦДФГҐЧЙКЛМНПРСТВШХЗЖЬЩ"
local uppercase_c = "[" .. uppercase .. "]"


-- Cyrillic-to-IPA substitution tables, grouped by the length of the Cyrillic
-- key. convertir() walks the outer table with ipairs(), so group [1]
-- (three-letter sequences) is applied first, then [2] (two-letter sequences)
-- and finally [3] (single characters). Within one group the entries are
-- visited with pairs(), i.e. in no particular order.
local phonetic_chars_map = {

	-- single characters that map to IPA sounds; these are processed last
	[3] = {
		["а"] = "a",	["б"] = "b",	["в"] = "ʋ",	["г"] = "ɦ",	["ґ"] = "ɡ", 
		["д"] = "d",	["е"] = "ɛ",	["є"] = "jɛ",	["ж"] = "ʒ",	["з"] = "z", 
		["и"] = "ɪ",	["і"] = "i",	["ї"] = "ji",	["й"] = "j",	["к"] = "k", 
		["л"] = "l",	["м"] = "m",	["н"] = "n",	["о"] = "ɔ",	["п"] = "p", 
		["р"] = "r",	["с"] = "s",	["т"] = "t",	["у"] = "u",	["ф"] = "f", 
		["х"] = "x",	["ц"] = "t͡s",	["ч"] = "t͡ʃ",	["ш"] = "ʃ",	["щ"] = "ʃt͡ʃ", 
		["ь"] = "ʲ",	["ю"] = "ju",	["я"] = "ja",	["’"] = "j",
		-- accent characters become IPA stress marks (acute → primary, grave →
		-- secondary); convertir() later moves the mark in front of the vowel
		[AC] = "ˈ", [GR] = "ˌ",
	},

	-- character sequences of two that map to IPA sounds
	[2] = {
		["дж"] = "d͡ʒ",	["дз"] = "d͡z",
		-- Dental plosives assimilate to following hissing/hushing consonants, which is not noted in the spelling.
		["дс"] = "d͡zs",   ["дш"] = "d͡ʒʃ",   ["дч"] = "d͡ʒt͡ʃ", ["дц"] = "d͡zt͡s",
		["тс"] = "t͡s",	["тш"] = "t͡ʃʃ",   ["тч"] = "t͡ʃː", ["тц"] = "t͡sː", 
	},

	-- character sequences of three that map to IPA sounds; these are processed first
	[1] = {
		["дзь"] = "d͡zʲ", 
		-- Dental plosives assimilate to following hissing/hushing consonants, which is not noted in the spelling.
		["тьс"] = "t͡sʲː"
	},
}

-- Cyrillic-level rewrite rules, keyed by pattern and mapping to the
-- replacement string. "#" stands for a word boundary: convertir() wraps the
-- word in "#" before these rules are applied. "ː" marks a long consonant.
--
-- NOTE(review): this is a hash table, so a consumer that walks it with pairs()
-- visits the rules in an unspecified order. The "first ... then ..." ordering
-- described below, and the precedence of overlapping rules such as
-- "стськ" / "стс" or "#зш" / "зш", only hold if the consumer imposes an order
-- itself.
local orthographic_replacements = {
	-- first apply consonant cluster simplifications that always occur orthographically
	["нтськ"	] = "ньськ",
	["стськ"	] = "ськ",
	["нтст"		] = "нст",
	["стч"		] = "шч",
	["стд"		] = "зд",
	["стс"		] = "сː",
	["#зш"		] = "#шː",
	["зш"		] = "жш",
	["#зч"		] = "#шч",
	["зч"		] = "жч",

	-- then long consonants that are orthographically geminated.
	["([бвгґд])%1"			] = "%1ː",
	["([^д]+)жж"			] = "%1жː", -- a джж sequence encodes diphonemic дж and is left alone
	["([^д]+)зз"			] = "%1зː", -- a дзз sequence encodes diphonemic дз and is left alone
	["([йклмнпрстфхцчшщ])%1"] = "%1ː",
	["дждж"					] = "джː",
	["дздз"					] = "дзː",
}

-- Character class of the IPA voiced obstruents that trigger regressive voicing
-- assimilation in convertir().
local voiced_obstruent = "[bdzʒɡɦ]"
-- Voiceless IPA segment → its voiced counterpart. convertir() replaces a key
-- with its value whenever the key is directly followed by one or more
-- characters from voiced_obstruent.
-- NOTE(review): convertir() walks this table with pairs(), so the rules run in
-- an unspecified order; keys that overlap (e.g. "ʃ" and "ʃt͡ʃ") may therefore
-- interact differently from run to run — confirm whether that matters.
local voicing = {
	["p"] = "b",
	["f"] = "v",
	["t"] = "d",
	["tʲ"] = "dʲ",
	["s"] = "z",
	["sʲ"] = "zʲ",
	["ʃ"] = "ʒ",
	["k"] = "ɡ",
	["x"] = "ɦ",
	["t͡s"] = "d͡z",
	["t͡sʲ"] = "d͡zʲ",
	["t͡ʃ"] = "d͡ʒ",
	["ʃt͡ʃ"] = "ʒd͡ʒ",
}

-- Precomposed Cyrillic letters carrying a grave accent → the plain letter
-- followed by the separate grave accent character GR. Used by
-- decompose_grave() as the replacement table.
local grave_decomposer = {
	["ѐ"] = "е" .. GR,
	["Ѐ"] = "Е" .. GR,
	["ѝ"] = "и" .. GR,
	["Ѝ"] = "И" .. GR,
}

-- Override table consulted by export.procesar_pron_args(): the first "ayuda"
-- value is looked up here and, when an entry exists, that entry replaces the
-- "ayuda" and "tl" arguments. Currently empty.
local pron_abc = {
	
}

-- Report whether `word` contains at most one vowel letter. Words with no
-- vowel at all therefore also count as monosyllabic.
local function is_monosyllabic(word)
	-- Deleting every non-vowel leaves exactly one character per vowel letter.
	local vowels_only = strsub(word, non_vowel_c, "")
	return strlen(vowels_only) < 2
end

-- Split precomposed Cyrillic letters with a grave accent (ѐ Ѐ ѝ Ѝ) into the
-- base letter plus a separate grave accent, using grave_decomposer.
-- Nothing comparable is done for the acute accent because there are no
-- precomposed Cyrillic characters with an acute, and other precomposed
-- letters (й, ї, Ї, ...) must stay intact.
local function decompose_grave(text)
	local decomposed = strsub(text, "[ѐЀѝЍ]", grave_decomposer)
	return decomposed
end

-- Decide whether `text` still lacks a required stress mark.
-- Returns true as soon as some word (or hyphen-separated part of a word) has
-- more than one vowel and carries no accent; returns false otherwise.
-- Whitespace-separated chunks that begin or end with a hyphen are treated as
-- prefixes/suffixes and are never required to carry an accent.
local function needs_accents(text)
	local decomposed = decompose_grave(text)
	for _, chunk in ipairs(strsplit(decomposed, "%s+")) do
		local is_affix = strfind(chunk, "^%-") or strfind(chunk, "%-$")
		if not is_affix then
			for _, part in ipairs(strsplit(chunk, "%-")) do
				-- an accented or monosyllabic part is fine as it stands
				if not (strfind(part, accents_c) or is_monosyllabic(part)) then
					return true
				end
			end
		end
	end
	return false
end

-- Consonant clusters (in IPA) that are kept together as a syllable onset:
-- convertir() looks clusters up here when deciding how far left a stress mark
-- may be moved.
-- The velar stop is spelled with IPA "ɡ" (U+0261) everywhere else in this
-- module (see phonetic_chars_map), so the clusters are listed with that
-- character; the ASCII "g" spellings never occur in the converted text and are
-- kept only so that no previously accepted key disappears.
local perm_syl_onset = list_to_set({
	'spr', 'str', 'skr', 'spl', 'skl',
	'sp', 'st', 'sk', 'sf', 'sx',
	'pr', 'br', 'tr', 'dr', 'kr', 'ɡr', 'gr', 'ɦr', 'fr', 'xr',
	'pl', 'bl', 'kl', 'ɡl', 'gl', 'ɦl', 'fl', 'xl',
})

-- Order in which the keys of orthographic_replacements have to be applied.
-- That table is a hash, so walking it with pairs() gives an unspecified order,
-- yet several rules overlap: "стськ" contains "стс", and the word-initial
-- "#зш" / "#зч" must win over the plain "зш" / "зч".
local orthographic_replacement_order = {
	-- consonant cluster simplifications that always occur orthographically
	"нтськ", "стськ", "нтст", "стч", "стд", "стс",
	"#зш", "зш", "#зч", "зч",
	-- long consonants that are orthographically geminated
	"([бвгґд])%1", "([^д]+)жж", "([^д]+)зз", "([йклмнпрстфхцчшщ])%1",
	"дждж", "дздз",
}

-- Apply every rule of orthographic_replacements to `p` in a deterministic
-- order: first the rules named in orthographic_replacement_order, then any
-- rule missing from that list, in sorted key order.
local function apply_orthographic_replacements(p)
	local done = {}
	for _, regex in ipairs(orthographic_replacement_order) do
		local replacement = orthographic_replacements[regex]
		if replacement then
			p = strsub(p, regex, replacement)
			done[regex] = true
		end
	end

	local leftover = {}
	for regex in pairs(orthographic_replacements) do
		if not done[regex] then
			insert(leftover, regex)
		end
	end
	table.sort(leftover)
	for _, regex in ipairs(leftover) do
		p = strsub(p, regex, orthographic_replacements[regex])
	end

	return p
end

-- Convert one normalized (lower-case, punctuation-free) Cyrillic word into its
-- phonetic IPA transcription.
--   p                 the word; accents are expected as separate accent characters
--   allow_unstressed  when false/nil, a word with exactly one vowel is treated
--                     as stressed even without an explicit accent
-- Returns the IPA string, without the internal "#" word-boundary markers.
local function convertir(p, allow_unstressed)
	-- "#" marks the word boundaries for the rules below
	p = "#" .. p .. "#"
	p = apply_orthographic_replacements(p)

	-- remap apostrophe to '!' so that it doesn't conflict with IPA stress mark
	p = strsub(p, "'", "!")

	-- replace multiple letter sequences first, single letters last
	for _, replacements in ipairs(phonetic_chars_map) do
		for key, replacement in pairs(replacements) do
			p = strsub(p, key, replacement)
		end
	end

	-- move stress mark, added by phonetic_chars_map, before vowel
	p = strsub(p, "([aɛiɪuɔ])([ˈˌ])", "%2%1")

	-- If the word is monosyllabic and not allow_unstressed, add an accent so
	-- that monosyllabic words without explicit stress marks get stressed
	-- vowel allophones; we use a different character from the regular
	-- primary stress mark so we can later remove it without affecting
	-- explicitly user-added accents on monosyllabic words, as in нема́ за́ що.
	local _, numberOfVowels = strsubn(p, "[aɛiɪuɔ]", "")
	if (numberOfVowels == 1) and not allow_unstressed then
		p = strsub(p, "([aɛiɪuɔ])", "⁀%1")
	end

	-- palatalizable consonants before /i/ or /j/ become palatalized
	p = strsub(p, "(" .. ipa_palatalizable_c .. ")([ː]?)([ˈˌ⁀]?)i", "%1ʲ%2%3i")
	p = strsub(p, "(" .. ipa_palatalizable_c .. ")([ː]?)j", "%1ʲ%2")

	-- eliminate garbage sequences of [ʲːj] resulting from -тьс- cluster followed by [j]
	p = strsub(p, "ʲːj", "ʲː")

	-- consonant simplification: ст + ц' → [с'ц']. We do it here because of palatalization.
	-- Due to the т +ц → [ц:] rule length is present. According to Орфоепскі словник p. 13,
	-- both forms are proper, without length in normal (colloquial) speech and with length
	-- in slow speech, so we parenthesize the length as optional.
	p = strsub(p, "st͡sʲ([ː]?)", "sʲt͡sʲ(%1)")

	-- assimilation: voiceless + voiced = voiced + voiced
	-- should /ʋ/ be included as voiced? Орфоепічний словник doesn't voice initial cluster of шв (p. 116)
	-- NOTE(review): pairs() visits the voicing rules in an unspecified order;
	-- the t͡ʒ / t͡z clean-up further down repairs half-voiced affricates, but
	-- other overlapping keys may still depend on the order — confirm.
	for voiceless, voiced in pairs(voicing) do
		p = strsub(p, voiceless .. "(" .. voiced_obstruent .. "+)", voiced .. "%1")
	end

	-- In the sequence of two consonants, of which the second is soft, the first is pronounced soft too
	-- unless the first consonant is a labial, namely б, п, в, ф, м.
	p = strsub(p, "([tdsznl])(.)ʲ", "%1ʲ%2ʲ")
	p = strsub(p, "([tdsznl])t͡sʲ", "%1ʲt͡sʲ")
	p = strsub(p, "([tdsznl])d͡zʲ", "%1ʲd͡zʲ")
	p = strsub(p, "t͡s(.)ʲ", "t͡sʲ%1ʲ")
	p = strsub(p, "d͡z(.)ʲ", "d͡zʲ%1ʲ")
	p = strsub(p, "d͡zt͡sʲ", "d͡zʲt͡sʲ")
	p = strsub(p, "t͡sd͡zʲ", "t͡sʲd͡zʲ")

	-- Hushing consonants ж, ч, ш assimilate to the following hissing consonants, giving a long hissing consonant:
	-- [ʒ] + [t͡sʲ] → [zʲt͡sʲ], [t͡ʃ] + [t͡sʲ] → [t͡sʲː], [ʃ] + [t͡sʲ] → [sʲt͡sʲ], [ʃ] + [sʲ] → [sʲː]
	p = strsub(p, "ʒt͡sʲ", "zʲt͡sʲ")
	p = strsub(p, "t͡ʃt͡sʲ", "t͡sʲː")
	p = strsub(p, "ʃt͡sʲ", "sʲt͡sʲ")
	p = strsub(p, "ʃsʲ", "sʲː")

	-- Hissing consonants before hushing consonants within a word assimilate - on зш and зч word-initially and 
	-- word-medially see the orthographic replacements applied at the top.
	-- [s] + [ʃ] → [ʃː],  [z] + [ʃ] → [ʒʃ], [z] + [t͡ʃ] → [ʒt͡ʃ]
	-- [z] + [d͡ʒ] → [ʒd͡ʒ]
	p = strsub(p, "zʒ", "ʒː")
	p = strsub(p, "sʃ", "ʃː")
	p = strsub(p, "zt͡ʃ", "ʒt͡ʃ")
	p = strsub(p, "zd͡ʒ", "ʒd͡ʒ")
	-- repair affricates whose second half alone was voiced by the loop above
	p = strsub(p, "t͡ʒ", "d͡ʒ")
	p = strsub(p, "t͡z", "d͡z")

	-- cleanup: excessive palatalization: CʲCʲCʲ → CCʲCʲ
	p = strsub(p, "([^aɛiɪuɔ]+)ʲ([^aɛiɪuɔ]+)ʲ([^aɛiɪuɔ]+)ʲ", "%1%2ʲ%3ʲ")

	-- unstressed /a/ has an allophone [ɐ]
	p = strsub(p, "([^ˈˌ⁀])a", "%1ɐ")
	-- unstressed /u/ has an allophone [ʊ]
	p = strsub(p, "([^ˈˌ⁀])u", "%1ʊ")
	-- unstressed /ɔ/ has by assimilation an allophone [o] before a stressed syllable with /u/ or /i/
	p = strsub(p, "ɔ([bdzʒɡɦmnlrpftskxʲʃ͡]+)([ˈˌ⁀][uiʊ])", "o%1%2")
	-- one allophone [e] covers unstressed /ɛ/ and /ɪ/
	p = strsub(p, "([^ˈˌ⁀])[ɛɪ]", "%1e")

	-- Remove the monosyllabic stress we auto-added to ensure that vowels in
	-- monosyllabic words get stressed allophones. Do this before vocalizing
	-- /ʋ/ and /j/. NOTE: Nothing below should depend on stress marks being
	-- present.
	p = strsub(p, "⁀", "")

	-- /ʋ/ has an allophone [u̯] in a syllable coda
	p = strsub(p, "(" .. ipa_vowel_c .. ")ʋ([" .. ipa_consonant_no_w .. "#])", "%1u̯%2")
	-- /ʋ/ has an allophone [w] before /ɔ, u/ and voiced consonants (not after a vowel; [ʋ] before a vowel already converted)
	p = strsub(p, "ʋ([ˈˌ]?[ɔuoʊbdzʒɡɦmnlr])", "w%1")
	-- /ʋ/ has an allophone [β̞] before remaining vowels besides /i/
	-- Not sure whether this looks good.
	-- p = strsub(p, "ʋ([ˈˌʲ]*[" .. ipa_vowel_no_i .. "])", "β̞%1")
	-- /ʋ/ has an allophone [ʍ] before voiceless consonants (not after a vowel; [ʋ] before a vowel already converted)
	p = strsub(p, "ʋ([pftskxʃ])", "ʍ%1")

	-- in a syllable-final position (i.e. the first position of a syllable coda) /j/ has an allophone [i̯]:
	p = strsub(p, "(" .. ipa_vowel_c .. ")j([" .. ipa_consonant_no_w .. "#])", "%1i̯%2")
	-- also at the beginning of a word before a consonant
	p = strsub(p, "#j(" .. ipa_consonant_no_w_c .. ")", "#i̯%1")

	-- remove old orthographic apostrophe
	p = strsub(p, "!", "")
	-- stress mark in correct place
	-- (1) Put the stress mark before the final consonant of a cluster (if any).
	p = strsub(p, "([^#" .. ipa_vowel .. "]?[ʲː]*)([ˈˌ])", "%2%1")
	-- (2) Continue moving it over the rest of an affricate with a tie bar.
	p = strsub(p, "([^#" .. ipa_vowel .. "]͡)([ˈˌ])", "%2%1")
	-- (3) Continue moving it over any "permanent onset" clusters (e.g. st, skr, pl, also Cj).
	p = strsub(p, "(.)(ʲ?)(" .. ipa_consonant_c .. ")(ʲ?)([ˈˌ])(" .. ipa_consonant_c .. ")",
		function(a, aj, b, bj, stress, c)
			if perm_syl_onset[a .. b .. c] then
				return stress .. a .. aj .. b .. bj .. c
			elseif perm_syl_onset[b .. c] or c == "j" then
				return a .. aj .. stress .. b .. bj .. c
			else
				return a .. aj .. b .. bj .. stress .. c
			end
		end)
	-- (4) If we're in the middle of an affricate with a tie bar, continue moving back
	--     if the following consonant is /j/, else move forward.
	p = strsub(p, "([^#" .. ipa_vowel .. "]͡)([ˈˌ])(.ʲ?j)", "%2%1%3")
	p = strsub(p, "([^#" .. ipa_vowel .. "]͡)([ˈˌ])(.ʲ?)", "%1%3%2")
	-- (5) Move back over any remaining consonants at the beginning of a word.
	p = strsub(p, "#([^#" .. ipa_vowel .. "]+)([ˈˌ])", "#%2%1")
	-- (6) Move back over u̯ or i̯ at the beginning of a word.
	p = strsub(p, "#([ui]̯)([ˈˌ])", "#%2%1")

	-- normalize the order of palatalization and length marks to ʲː
	p = strsub(p, "ʲ?ːʲ", "ʲː")

	-- use dark [ɫ] for non-palatal /l/
	p = strsub(p, "l([^ʲ])", "ɫ%1")

	-- drop the word-boundary markers added at the top
	return strsub(p, "#", "")
end

-- Prepare raw input for conversion: lower-case it, decompose grave-accented
-- letters, turn fragment-delimiting punctuation into " | " foot boundaries,
-- drop the remaining punctuation, replace hyphens with spaces and collapse
-- redundant bars and whitespace. Returns the cleaned-up string.
-- NOTE(review): PUNTUACION_EXTRA contains ' and ’, so apostrophes are removed
-- here, before convertir() gets to see them — confirm that this is intended.
local function normalizar(texto)
	local resultado = decompose_grave(strlower(texto))

	-- {pattern, replacement} pairs, applied strictly in this order
	local pasos = {
		{PUNTUACION, " | "},         -- fragment delimiters become IPA foot boundaries |
		{PUNTUACION_EXTRA, ""},      -- remove whatever punctuation is left
		{"[%-‐]", " "},              -- hyphens become spaces (austro-húngaro, franco-italiano)
		{"%s*|%s*|%s*", " | "},      -- merge doubled bars and their surrounding spaces
		{"%s+", " "},                -- collapse runs of whitespace
	}
	for _, paso in ipairs(pasos) do
		resultado = strsubrep(resultado, paso[1], paso[2])
	end

	-- trim spaces and bars at both ends
	resultado = strstrip(resultado, "[%s|]+")
	return resultado
end

-- Produce the IPA transcription of `text`.
--   text              raw spelling; it is normalized with normalizar() first
--   allow_unstressed  when false/nil, raises an error if needs_accents()
--                     reports a word lacking its stress mark; the flag is also
--                     handed on to convertir()
-- Returns a list holding one list whose content is the HTML-encoded
-- transcription, with fragments joined by " | " and words by a single space.
local function generar_pron(text, allow_unstressed)
	text = normalizar(text)

	if not allow_unstressed and needs_accents(text) then
		error("Multisyllabic words that are not prefixes or suffixes must have an acute accent marking the stress: " .. text)
	end

	local partes = {}
	for _, fragmento in ipairs(strsplit(text, "%s*|%s*")) do
		-- convert the fragment word by word
		local ipa = {}
		for _, palabra in ipairs(strsplit(fragmento, "%s")) do
			insert(ipa, convertir(palabra, allow_unstressed))
		end
		insert(partes, concat(ipa, " "))
	end

	return {{strhtml(concat(partes, " | "))}}
end

-- Fill in the pronunciation arguments of an entry.
--   titulo  page title; used as the spelling when no "ayuda" value was given
--   args    argument table; this function reads the list-valued fields
--           "ayuda", "fone" and "fono" and may write "ayuda", "tl", "fone"
--           and "rima"
-- When neither "fone" nor "fono" holds a value, up to 9 transcriptions are
-- generated from the "ayuda" values and appended to args["fone"]. The rhyme
-- args["rima"] is then derived from the first "fone" entry.
-- Returns the same (modified) `args` table.
function export.procesar_pron_args(titulo, args)
	-- without an explicit spelling, fall back to the page title
	if #args["ayuda"] < 1 then
		args["ayuda"][1] = titulo
	end

	if #args["fone"] < 1 and #args["fono"] < 1 then
		-- a pron_abc entry overrides the supplied spellings
		local override = pron_abc[args["ayuda"][1]]
		if override then
			args["ayuda"] = override
			args["tl"] = override
		end

		local total = #args["ayuda"]
		local j = 1 -- index into args["ayuda"]
		local k = 1 -- one more than the number of pronunciations inserted (at most 9)
		while k <= 9 and j <= total do
			-- allow_unstressed is always true here, so generar_pron() never
			-- raises the missing-accent error for these inputs
			local fone = generar_pron(args["ayuda"][j], true)
			for _, pron in ipairs(fone) do
				insert(args["fone"], pron)
				k = k + 1
				if k > 9 then
					break
				end
			end
			j = j + 1
		end
	end

	if args["fone"][1] and args["fone"][1][1] then
		-- keep the last word only
		local rim = strsub(args["fone"][1][1], ".*%s([^%s]+)$", "%1")
		-- keep what follows the last primary stress mark
		rim = strsub(rim, "^.*ˈ(.-)$", "%1")
		-- the rhyme starts at the first vowel of that remainder
		args["rima"] = strsub(rim, ".-".."(["..ipa_vowel.."].*"..")".."$", "%1")
	end

	return args
end


return export