-- Módulo:generar-pron/sk
--
-- Documentation for this module can be created at Módulo:generar-pron/sk/doc
local export = {}

local insert = table.insert
local concat = table.concat

local m_str = require("Módulo:String")

local U = m_str.char
local strlower = m_str.lower
local strsplit = m_str.split
local strlen = m_str.ulen
local substr = m_str.sub
local strfind = m_str.find
local strmatch = m_str.match
local strmatchit = m_str.gmatch
local strsubn = m_str.gsub
local strsubrep = m_str.gsub_rep
local strstrip = m_str.strip
local strnfd = m_str.toNFD
local strnfc = m_str.toNFC
local strhtml = m_str.encode_html

-- Substitution helper: forwards to strsubn and keeps only the resulting
-- string, discarding any additional return values.
local function strsub(text, pattern, repl, n)
	-- The extra parentheses truncate the call to its first result.
	return (strsubn(text, pattern, repl, n))
end

-- Characters common to both punctuation classes, already escaped for use
-- inside a pattern character class.
local PUNTUACION_COMUN = "%(%)%[%]%{%}¡!¿?.,;:–—"
-- Punctuation that normalizar() turns into fragment boundaries.
local PUNTUACION = "[" .. PUNTUACION_COMUN .. "]"
-- The same set plus quotation marks, apostrophes and accents; normalizar()
-- deletes whatever of these is still present.
local PUNTUACION_EXTRA = "[" .. PUNTUACION_COMUN .. "\"“”„‟‘’«»»«‹››‹'´" .. "]"

-- IPA symbols and combining diacritics used to assemble transcriptions.
local long = "ː"
local nonsyllabic = U(0x032F)	-- combining inverted breve below
local syllabic = U(0x0329)		-- combining vertical line below
local tie = U(0x0361)			-- combining double inverted breve
local primary_stress, secondary_stress = "ˈ", "ˌ"

-- Spelling-to-IPA table. Keys are single letters or two-letter sequences of
-- the (lower-cased) spelling; values are the IPA text they are replaced by.
local data = {
	-- Long vowels
	["á"] = "a" .. long,
	["é"] = "e" .. long,
	["í"] = "i" .. long,
	["ó"] = "o" .. long,
	["ú"] = "u" .. long,
	["ý"] = "i" .. long,

	-- Other vowels whose symbol differs from the letter
	["ä"] = "ɛɐ" .. nonsyllabic,
	["ö"] = "œ",
	["ü"] = "y",
	o = "ɔ",
	y = "i",

	-- Diphthongs
	["ô"] = "u" .. nonsyllabic .. "o",
	ia = "i" .. nonsyllabic .. "a",
	ie = "i" .. nonsyllabic .. "e",
	iu = "i" .. nonsyllabic .. "u",

	-- Long syllabic liquids
	["ĺ"] = "l" .. syllabic .. long,
	["ŕ"] = "r" .. syllabic .. long,

	-- Affricates (written with a tie bar)
	c = "t" .. tie .. "s",
	["č"] = "t" .. tie .. "ʃ",
	dz = "d" .. tie .. "z",
	["dž"] = "d" .. tie .. "ʒ",

	-- Remaining consonants
	["ď"] = "ɟ",
	g = "ɡ",
	h = "ɦ",
	ch = "x",
	["ľ"] = "ʎ",
	["ň"] = "ɲ",
	q = "kv",
	["š"] = "ʃ",
	["ť"] = "c",
	w = "v",
	x = "ks",
	["ž"] = "ʒ",

	-- Stress markers that may be typed in the input
	["\""] = primary_stress,
	["%"] = secondary_stress,
}

-- Letters (and the space) that are transcribed as themselves:
-- data["a"] = "a", data["b"] = "b", and so on.
local identity_letters = "abdefijklmnprstuvz "
for position = 1, #identity_letters do
	-- All of these are single-byte ASCII, so byte indexing is safe here.
	local letter = identity_letters:sub(position, position)
	data[letter] = letter
end

-- Spelling-level palatalization data: the alveolar letters affected, the
-- front vowels (and i-diphthongs) that trigger the change, and the soft
-- letter each alveolar is rewritten to.
local alveolars_ortho = {
	"t",
	"d",
	"n",
}
local front_vowels_ortho = {
	"e", "é",
	"i", "í",
	"ie", "ia",
}

local palatalized_ortho = { t = "ť", d = "ď", n = "ň" }

-- Byte-level pattern matching one UTF-8 encoded character.
local UTF8Char = "[\1-\127\194-\244][\128-\191]*"
-- Combining marks that flag a sound as non-syllabic / syllabic, first as
-- plain strings and then wrapped as pattern character classes.
local nonsyllabicDiacritics = U(0x0311) .. U(0x032F)
local syllabicDiacritics = U(0x0329) .. U(0x030D)
local nonsyllabicDiacritic = ("[%s]"):format(nonsyllabicDiacritics)
local syllabicDiacritic = ("[%s]"):format(syllabicDiacritics)
-- Generic list: the phonetic vowels of all languages.
local vowels_IPA = "iyɨʉɯuɪʏʊeøɘɵɤoəɚɛœɜɝɞʌɔæɐaɶɑɒäëïöüÿᵻᵿ"
local vowel_IPA = ("[%s]"):format(vowels_IPA)

--- Count the syllable nuclei of an IPA transcription.
-- A nucleus is a consonant carrying a syllabic diacritic, two vowels joined
-- by a tie bar, or a single vowel; a vowel followed by a non-syllabic
-- diacritic is ignored. Language-specific diphthong tables are not consulted.
-- @param remainder IPA transcription (string).
-- @return the number of nuclei found, or nil when the transcription starts
--         or ends with a hyphen or contains no nucleus at all.
local function getVowels(remainder)
	-- If a hyphen is at the beginning or end of the transcription (optionally
	-- next to a "/" or bracket delimiter), do not count syllables.
	if string.find(remainder, "^[%/%[]?%-") or string.find(remainder, "%-[%/%]]?$") then
		return nil
	end

	local count = 0

	-- Decompose so that combining diacritics are separate characters.
	remainder = strnfd(remainder)
	remainder = (string.gsub(remainder, "%((.*)%)", "%1")) -- Remove parentheses.

	-- Loop-invariant patterns, built once.
	local nonsyllabic_vowel = "^" .. vowel_IPA .. nonsyllabicDiacritic
	local syllabic_consonant = "^." .. syllabicDiacritic
	local tied_vowels = "^" .. vowel_IPA .. tie .. vowel_IPA
	local single_vowel = "^" .. vowel_IPA

	while remainder ~= "" do
		-- Ignore nonsyllabic vowels
		remainder = strsub(remainder, nonsyllabic_vowel, "")

		local m =
			strmatch(remainder, syllabic_consonant) or	-- Syllabic consonant
			strmatch(remainder, tied_vowels) or			-- Tie bar
			strmatch(remainder, single_vowel)			-- Plain vowel

		if m then
			-- Found a nucleus, count it and step over it. substr indexes by
			-- character, so the step must be the character length of the
			-- match: the byte length (#m) is larger for multi-byte matches
			-- such as "ɔ" or a consonant plus combining mark, and would skip
			-- the characters that follow.
			count = count + 1
			remainder = substr(remainder, strlen(m) + 1)
		else
			-- Found a non-vowel, skip it
			remainder = substr(remainder, 2)
		end
	end

	if count ~= 0 then
		return count
	end

	return nil
end

-- Spelled-out names of the letters of the alphabet, keyed by the letter in
-- upper-/title-case and in lower-case. procesar_pron_args() looks the page
-- title up here and, on a hit, transcribes these names instead of the title.
-- NOTE(review): "i griego" / "dlhý i griego" look like Spanish rather than
-- Slovak; confirm these names are intended.
local pron_abc = {
	A = {"á"},			a = {"á"},
	["Á"] = {"dlhé á"},	["á"] = {"dlhé á"},
	["Ä"] = {"prehlasované á", "a s dvoma bodkami", "široké e"},
	["ä"] = {"prehlasované á", "a s dvoma bodkami", "široké e"},
	B = {"bé"},			b = {"bé"},
	C = {"cé"},			c = {"cé"},
	["Č"] = {"čé"},		["č"] = {"čé"},
	D = {"dé"},			d = {"dé"},
	["Ď"] = {"ďé", "mäkké dé"},
	["ď"] = {"ďé", "mäkké dé"},
	Dz = {"dzé"},		dz = {"dzé"},
	["Dž"] = {"džé"},	["dž"] = {"džé"},
	E = {"é"},			e = {"é"},
	["É"] = {"dlhé é"},	["é"] = {"dlhé é"},
	F = {"ef"},			f = {"ef"},
	G = {"gé"},			g = {"gé"},
	H = {"há"},			h = {"há"},
	Ch = {"chá"},		ch = {"chá"},
	I = {"í"},			i = {"í"},
	["Í"] = {"dlhé í"},	["í"] = {"dlhé í"},
	J = {"jé"},			j = {"jé"},
	K = {"ká"},			k = {"ká"},
	L = {"el"},			l = {"el"},
	["Ĺ"] = {"dlhé el"},	["ĺ"] = {"dlhé el"},
	["Ľ"] = {"eľ", "mäkké el"},
	["ľ"] = {"eľ", "mäkké el"},
	M = {"em"},			m = {"em"},
	N = {"en"},			n = {"en"},
	["Ň"] = {"eň"},		["ň"] = {"eň"},
	O = {"o"},			o = {"o"},
	["Ó"] = {"ó", "dlhé o"},
	["ó"] = {"ó", "dlhé o"},
	["Ô"] = {"ô"},		["ô"] = {"ô"},
	P = {"pé"},			p = {"pé"},
	Q = {"kvé"},		q = {"kvé"},
	R = {"er"},			r = {"er"},
	["Ŕ"] = {"dlhé er"},	["ŕ"] = {"dlhé er"},
	S = {"es"},			s = {"es"},
	["Š"] = {"eš"},		["š"] = {"eš"},
	T = {"té"},			t = {"té"},
	["Ť"] = {"ťé", "mäkké té"},
	["ť"] = {"ťé", "mäkké té"},
	U = {"u"},			u = {"u"},
	["Ú"] = {"dlhé ú"},	["ú"] = {"dlhé ú"},
	V = {"vé"},			v = {"vé"},
	W = {"dvojité vé"},	w = {"dvojité vé"},
	X = {"iks"},		x = {"iks"},
	Y = {"ypsilon", "i griego"},
	y = {"ypsilon", "i griego"},
	["Ý"] = {"dlhý ypsilon", "dlhý i griego"},
	["ý"] = {"dlhý ypsilon", "dlhý i griego"},
	Z = {"zet"},		z = {"zet"},
	["Ž"] = {"žet"},	["ž"] = {"žet"},
}


-- [==[			Phonological rules		]==]

--[[
Sounds written with more than one character (the tie-bar affricates).
They are temporarily replaced by single digits, which makes them easier
to process as one unit.	]]

local multiple_char = {}
for _, halves in ipairs({ { "t", "s" }, { "t", "ʃ" }, { "d", "z" }, { "d", "ʒ" } }) do
	multiple_char[#multiple_char + 1] = halves[1] .. tie .. halves[2]
end

-- Maps each multi-character sound to the decimal string of its position in
-- multiple_char ("1", "2", ...), the single-character stand-in used while
-- the phonological rules run.
local singlechar = {}
for position = 1, #multiple_char do
	singlechar[multiple_char[position]] = tostring(position)
end

-- Obstruents listed so that voiceless[i] and voiced[i] form a voicing pair;
-- the digits stand for the affricates replaced through singlechar.
local voiceless	= { "p", "t", "c", "k", "f", "s", "ʃ", "x", "1", "2", }
local voiced	= { "b", "d", "ɟ", "ɡ", "v", "z", "ʒ", "ɦ", "3", "4", }
local sonorants = { "m", "n", "ɲ", "r", "l", "ʎ", "j", }

-- features[c].voicing: whether obstruent c is voiced.
-- indices[c]: position of c inside its own list (voiceless or voiced).
local features = {}
local indices = {}
do
	local function record(consonants, is_voiced)
		for position = 1, #consonants do
			local consonant = consonants[position]
			features[consonant] = features[consonant] or {}
			features[consonant]["voicing"] = is_voiced
			indices[consonant] = position
		end
	end

	record(voiceless, false)
	record(voiced, true)
end

--- Rewrite the spelling so that palatalization is explicit: every alveolar
-- letter listed in alveolars_ortho that is directly followed by one of
-- front_vowels_ortho is replaced by its soft counterpart.
-- @param term spelling to rewrite (string).
-- @return the rewritten spelling.
local function palatalize_orthography(term)
	for a = 1, #alveolars_ortho do
		local hard = alveolars_ortho[a]
		local soft = palatalized_ortho[hard]
		for v = 1, #front_vowels_ortho do
			local vowel = front_vowels_ortho[v]
			term = strsub(term, hard .. vowel, soft .. vowel)
		end
	end
	return term
end

--- Devoice word-final voiced obstruents.
-- Every run of voiced obstruents that stands directly before whitespace or
-- at the very end of the string is replaced by its voiceless counterparts
-- (voiced[i] -> voiceless[i]). Only those final runs are touched: the same
-- consonants elsewhere in the term keep their voicing, and the whitespace
-- after a run is preserved.
-- @param IPA transcription in which the affricates are already replaced by
--        their single-digit stand-ins (string).
-- @return the transcription with word-final clusters devoiced.
local function devoice_finally(IPA)
	local voiced_cluster = "[" .. concat(voiced) .. "]+"

	-- Map every consonant of a voiced cluster to its voiceless partner.
	local function devoice(cluster)
		local devoiced = {}
		for i = 1, strlen(cluster) do
			local consonant = substr(cluster, i, i)
			-- Fall back to the character itself if it has no partner.
			devoiced[#devoiced + 1] = voiceless[indices[consonant]] or consonant
		end
		return concat(devoiced)
	end

	-- Word-final clusters inside a phrase: keep the whitespace that follows.
	IPA = strsub(IPA, "(" .. voiced_cluster .. ")(%s)", function(cluster, space)
		return devoice(cluster) .. space
	end)
	-- Cluster at the very end of the transcription.
	IPA = strsub(IPA, voiced_cluster .. "$", devoice)

	return IPA
end

--- Mark sonorants as syllabic where they have no vowel to lean on.
-- @param IPA transcription (string).
-- @return the transcription with the syllabic diacritic inserted.
local function syllabicize_sonorants(IPA)
	-- Character class of the sonorants that can become syllabic: all of them
	-- except ɲ, ʎ and j.
	local sonorant_class = strsub("[" .. concat(sonorants) .. "]", "[ɲʎj]", "")
	local obstruent_class = "[" .. concat(voiced) .. concat(voiceless) .. "]"
	-- Merge both classes into one by dropping the inner brackets.
	local merged = strsub(sonorant_class .. obstruent_class, "[%[%]]", "")
	local consonant_class = "[" .. merged .. "]"

	-- { pattern, replacement } pairs, applied in this order.
	local rules = {
		-- between a consonant and an obstruent
		{ "(" .. consonant_class .. sonorant_class .. ")(" .. obstruent_class .. ")", "%1" .. syllabic .. "%2" },
		-- at the beginning of the string before an obstruent
		{ "^(" .. sonorant_class .. ")(" .. obstruent_class .. ")", "%1" .. syllabic .. "%2" },
		-- at the end of the string after an obstruent
		{ "(" .. obstruent_class .. sonorant_class .. ")$", "%1" .. syllabic },
	}

	for _, rule in ipairs(rules) do
		IPA = strsub(IPA, rule[1], rule[2])
	end

	return IPA
end

--- Prefix the primary stress mark to a polysyllabic single word.
-- Transcriptions containing a space, and those with at most one syllable,
-- are returned unchanged.
-- @param IPA transcription (string).
-- @return the transcription, possibly with the stress mark prepended.
local function add_stress(IPA)
	-- Words like “čln” or “v” contain no designated vowels, yet they are
	-- valid Slovak words: treat "no count" as a single syllable.
	local syllable_count = getVowels(IPA) or 1

	if syllable_count <= 1 or strfind(IPA, " ") then
		return IPA
	end

	return primary_stress .. IPA
end

--- Run the phonological rules over a raw letter-by-letter transcription.
-- @param IPA transcription produced from the spelling (string).
-- @return the transcription after devoicing, syllabic marking, stress
--         assignment and degemination.
local function apply_rules(IPA)
	-- Swap the multi-character sounds for their one-digit stand-ins.
	for sound, digit in pairs(singlechar) do
		IPA = strsub(IPA, sound, digit)
	end

	IPA = add_stress(syllabicize_sonorants(devoice_finally(IPA)))

	-- Collapse a doubled consonant into a single one.
	local any_consonant = "[" .. concat(sonorants) .. concat(voiceless) .. concat(voiced) .. "]"
	IPA = strsub(IPA, "(" .. any_consonant .. ")%1", "%1")

	-- Put the multi-character sounds back in place of the digits.
	for sound, digit in pairs(singlechar) do
		IPA = strsub(IPA, digit, sound)
	end

	return IPA
end

--- Normalize a spelling before transcription: lower-case it, make the
-- palatalization explicit, turn delimiting punctuation into " | "
-- boundaries, drop the remaining punctuation and tidy up whitespace.
-- @param texto text to normalize (string).
-- @return the normalized text.
local function normalizar(texto)
	texto = palatalize_orthography(strlower(texto))

	-- { pattern, replacement } pairs, applied in this order.
	local pasos = {
		-- whatever delimits fragments becomes an IPA foot boundary "|"
		{ PUNTUACION, " | " },
		-- remove any punctuation that is left
		{ PUNTUACION_EXTRA, "" },
		-- hyphens become spaces (austro-húngaro, franco-italiano)
		{ "[%-‐]", " " },
		-- finally, collapse superfluous bars and spaces
		{ "%s*|%s*|%s*", " | " },
		{ "%s+", " " },
	}

	for _, paso in ipairs(pasos) do
		texto = strsubrep(texto, paso[1], paso[2])
	end

	return (strstrip(texto, "[%s|]+"))
end

--- Transcribe a term into IPA.
-- The normalized term is split into fragments at "|" and into words at
-- whitespace; each word is converted through the data table, trying a
-- two-letter sequence before a single letter. The phonological rules are
-- then applied to the rejoined transcription.
-- @param term spelling to transcribe (string).
-- @return two values: {{IPA}} and {{normalized spelling}}, both HTML-encoded.
-- Raises an error for a letter that has no entry in the data table.
local function generar_pron(term)
	term = normalizar(term)

	local fragmentos_convertidos = {}

	for _, fragmento in ipairs(strsplit(term, "%s*|%s*")) do
		local palabras_convertidas = {}

		for _, palabra in ipairs(strsplit(fragmento, "%s")) do
			local resto = palabra
			local piezas = {}

			while strlen(resto) > 0 do
				local digrafo = substr(resto, 1, 2) or ""
				local sonido = data[digrafo]

				if sonido then
					resto = substr(resto, 3)
				else
					local letra = substr(resto, 1, 1)
					sonido = data[letra] or error('The letter "' .. tostring(letra) .. '" is not a member of the Slovak alphabet.')
					resto = substr(resto, 2)
				end

				insert(piezas, sonido)
			end

			insert(palabras_convertidas, concat(piezas))
		end

		insert(fragmentos_convertidos, concat(palabras_convertidas, " "))
	end

	local IPAstr = apply_rules(concat(fragmentos_convertidos, " | "))

	return {{strhtml(IPAstr)}}, {{strhtml(term)}}
end

--- Fill in the pronunciation arguments of an entry.
-- When neither args["fone"] nor args["fono"] holds a value, up to nine
-- transcriptions are generated from args["ayuda"] (which defaults to the
-- title, or to the letter names in pron_abc when the first help text is a
-- single letter listed there) and appended to args["fono"] / args["fgraf"].
-- args["rima"] is then derived from the first entry of args["fono"].
-- @param titulo page title (string).
-- @param args argument table; the fields "ayuda", "fone", "fono" and "fgraf"
--        are expected to be lists. The table is modified in place.
-- @return the same args table.
function export.procesar_pron_args(titulo, args)
	-- Without explicit help text, transcribe the title itself.
	if #args["ayuda"] < 1 then
		args["ayuda"][1] = titulo
	end

	if #args["fone"] < 1 and #args["fono"] < 1 then
		-- A single letter is pronounced by its name(s).
		local nombres = pron_abc[args["ayuda"][1]]
		if nombres then
			args["ayuda"] = nombres
			args["tl"] = nombres
		end

		local total = #args["ayuda"]
		local j = 1 -- index into the help texts
		local k = 1 -- number of pronunciations inserted, plus one (at most 9)
		while k <= 9 and j <= total do
			local fono, fgraf = generar_pron(args["ayuda"][j])
			for i, _ in ipairs(fono) do
				insert(args["fono"], fono[i])
				insert(args["fgraf"], fgraf[i])
				k = k + 1
				if k > 9 then
					break
				end
			end
			j = j + 1
		end
	end

	if args["fono"][1] and args["fono"][1][1] then
		-- Keep only the last word...
		local rim = strsub(args["fono"][1][1], ".*%s([^%s]+)$", "%1")
		-- ...then what follows the primary stress mark...
		rim = strsub(rim, "^.*" .. primary_stress .. "(.-)$", "%1")
		-- ...and start the rhyme at the first vowel of that stretch.
		args["rima"] = strsub(rim, ".-" .. "(" .. vowel_IPA .. ".*" .. ")" .. "$", "%1")
	end

	return args
end

return export