Módulo:generar-pron/ang

De Wikcionario, el diccionario libre

La documentación para este módulo puede ser creada en Módulo:generar-pron/ang/doc

--[=[

Implementation of pronunciation-generation module from spelling for
Old English.

Author: Benwing
Adaptado por Tmagc

Generally, the user should supply the spelling, properly marked up with
macrons for long vowels, and ċ ġ ċġ sċ for soft versions of these consonants.
In addition, the following symbols can be used:

-- acute accent on a vowel to override the position of primary stress
--   (in a diphthong, put it over the first vowel)
-- grave accent to add secondary stress
-- circumflex to force no stress on the word or prefix (e.g. in a compound)
-- . (period) to force a syllable boundary
-- - (hyphen) to force a prefix/word or word/word boundary in a compound word;
--   the result will be displayed as a single word but the consonants on
--   either side treated as if they occurred at the beginning/end of the word
-- + (plus) is the opposite of -; it forces a prefix/word or word/word boundary
--   to *NOT* occur when it otherwise would
-- _ (underscore) to force the letters on either side to be interpreted
--   independently, when the combination of the two would normally have a
--   special meaning

]=]

local export = {}

local insert = table.insert
local concat = table.concat

local m_table = require("Módulo:tabla")
local m_str = require("Módulo:String")

local u = m_str.char
local strsubb = m_str.gsubb
local strsubn = m_str.gsub
local strsubrep = m_str.gsub_rep
local strstrip = m_str.strip
local strfind = m_str.find
local strmatch = m_str.match
local strsplit = m_str.split
local strlen = m_str.len
local strlower = m_str.lower
local strnfd = m_str.toNFD
local strnfc = m_str.toNFC
local strhtml = m_str.encode_html

-- Thin wrapper around strsubn(): performs the same substitution but hands
-- back only the first return value, dropping any further ones.
local function strsub(term, foo, bar, n)
	-- The extra parentheses truncate the call to exactly one result.
	return (strsubn(term, foo, bar, n))
end

-- Byte-oriented counterpart of strsub(): calls the string's own gsub method
-- and returns only the resulting string, not the substitution count.
local function gsub(term, foo, bar, n)
	-- The extra parentheses truncate the call to exactly one result.
	return (term:gsub(foo, bar, n))
end

-- Character class of punctuation that delimits fragments; normalizar()
-- replaces every match with " | ". Note that it includes the period.
local PUNTUACION = "[%(%)%[%]%{%}¡!¿?.,;:–—]"
-- Same class extended with quotation marks; normalizar() deletes whatever
-- still matches it after the fragment delimiters have been converted.
local PUNTUACION_EXTRA = "[%(%)%[%]%{%}¡!¿?.,;:–—\"“”„‟‘’«»»«‹››‹]"

-- Combining diacritics, built from their Unicode code points.
local ACUTE = u(0x0301)
local GRAVE = u(0x0300)
local CFLEX = u(0x0302)
local MACRON = u(0x0304)
local DOTABOVE = u(0x0307)
-- Only referenced by the commented-out syllabic-resonant rules in
-- phonetic_rules.
local SYLLABIC = u(0x0329)
local CEDILLA = u(0x0327)
local DOUBLE_BREVE_BELOW = u(0x035C)

-- Part-of-speech codes passed around as `pos`.
local SUST = 1 -- includes nouns and adjectives
local VERB = 2
local VERBAL = 3

-- Base letter + combining mark sequences that must be put back together
-- after NFD decomposition, so that these letters stay single characters.
local recomposer = {
	["g" .. DOTABOVE] = "ġ",
	["G" .. DOTABOVE] = "Ġ",
	["c" .. DOTABOVE] = "ċ",
	["C" .. DOTABOVE] = "Ċ",
	-- used in "explicit allophone" notation in [[Module:ang-pron]]
	["c" .. CEDILLA] = "ç",
	["C" .. CEDILLA] = "Ç",
}

-- Decompose macron, acute, grave, circumflex, but leave alone ġ, ċ, ç and
-- their uppercase equivalents.
--
-- text: the string to decompose.
-- Returns the NFD form of `text` in which every sequence listed in
-- `recomposer` has been recomposed into its single-character form.
local function decompose(text)
	text = strnfd(text)
	-- The class must list every combining mark that occurs in a recomposer
	-- key; previously the cedilla was missing, so the ç/Ç entries could never
	-- match. Sequences without a recomposer entry (e.g. another letter plus
	-- dot above) are presumably left unchanged, as with plain string.gsub
	-- when the table lookup yields nil.
	text = strsub(text, ".[" .. DOTABOVE .. CEDILLA .. "]", recomposer)
	return text
end

-- We use the following syllable-splitting algorithm.
-- (1) A single consonant goes with the following syllable.
-- (2) Two consonants are split down the middle.
-- (3) For three or more consonants, check for clusters ending in
--     onsets_3 then onsets_2, with at least one preceding consonant.
--     If so, split between the onset and the preceding consonant(s).
-- (4) Check similarly for secondary_onsets_2. If seen, then check
--     the preceding consonant; if it's not an l or r, split before
--     the onset.
-- (5) Otherwise, split before the last consonant (i.e. the last
--     consonant goes with the following syllable, and all preceding
--     consonants go with the preceding syllable).

-- Two-consonant clusters that are kept together as a syllable onset.
-- Also consulted by split_on_word_boundaries() to decide whether the text
-- after a candidate prefix starts with a possible onset.
local onsets_2 = m_table.listToSet({
	"pr", "pl",
	"br", "bl",
	"tr", "tw",
	"dr", "dw",
	"cr", "cl", "cw", --skip "cn"
	"kr", "kl", "kw", --skip "kn"
	"gr", "gl", -- skip "gn"
	"sm", "sn", "sl", "sw",
	"sp",
	"st",
	"sc", "sk", "sċ",
	"fr", "fl", --skip "fn",
	"þr", "þw",
	"ðr", "ðw",
	"hr", "hl", "hw", -- skip "hn"
	"wr", "wl",
})

-- Consonant + n clusters skipped above; split_into_syllables() only treats
-- them as onsets when the preceding consonant is not l or r (step 4).
local secondary_onsets_2 = m_table.listToSet({
	"cn", "kn",
	"gn",
	"fn",
	"hn",
})

-- Three-consonant clusters that are kept together as a syllable onset.
local onsets_3 = m_table.listToSet({
	"spr", "spl",
	"str",
	"scr", "skr", "sċr",
})

-- Vowel pairs that break_vowels() keeps together as one vowel component.
-- Each pair is listed short, with a macron on the first vowel and with a
-- macron on the second vowel; the macron variants are stored decomposed.
local diphthongs = m_table.listToSet({
	"ea", decompose("ēa"), decompose("eā"),
	"eo", decompose("ēo"), decompose("eō"),
	"io", decompose("īo"), decompose("iō"),
	"ie", decompose("īe"), decompose("iē"),
})

-- Accent marks a user can type (no auto-generated stress markers); used to
-- build the `restriction` patterns below.
local accent_ = MACRON .. ACUTE .. GRAVE .. CFLEX

-- Prefixes that split_on_word_boundaries() may split off the start of a word.
-- Each entry is {pattern, spec}:
--   * pattern is matched at the beginning of the (decomposed) word;
--   * spec gives, per part of speech (keys `verb`, `noun`, `verbal`), the
--     stress type of the prefix: "unstressed", "stressed" or "secstressed";
--     a part of speech without a key means the prefix is not split for it;
--   * spec.restriction, when present, is a pattern the remainder of the word
--     must match for the split to happen.
-- Entries are tried in list order and the first one that splits wins, which
-- is why longer prefixes precede shorter ones with the same beginning.
local prefixes = {
	{decompose("ā"), {verb = "unstressed", noun = "stressed"}},
	{"æt", {verb = "unstressed"}},
	{"æfter", {verb = "secstressed", noun = "stressed"}}, -- not very common
	{"and", {verb = "unstressed", noun = "stressed"}},
	{"an", {verb = "unstressed", noun = "stressed"}},
	{"be", {verb = "unstressed", noun = "unstressed", restriction = "^[^" .. accent_ .. "ao]"}},
	{decompose("bī"), {noun = "stressed"}},
	{"ed", {verb = "unstressed", noun = "stressed"}}, -- not very common
	{"fore", {verb = "unstressed", noun = "stressed", restriction = "^[^" .. accent_ .. "ao]"}},
	{"for[þð]", {verb = "unstressed", noun = "stressed"}},
	{"for", {verb = "unstressed", noun = "unstressed"}},
	{"fram", {verb = "unstressed", noun = "stressed"}}, -- not very common
	-- following is rare as a noun, mostly from verbal forms
	{"ġeond", {verb = "unstressed"}}, 
	{"ġe", {verb = "unstressed", noun = "unstressed", restriction = "^[^" .. accent_ .. "ao]"}},
	{"in", {verb = "unstressed", noun = "stressed"}}, -- not very common
	{"mis", {verb = "unstressed"}},
	{"ofer", {verb = "secstressed", noun = "stressed"}},
	{"of", {verb = "unstressed", noun = "stressed"}},
	{"on", {verb = "unstressed", noun = "stressed"}},
	{"or", {noun = "stressed"}},
	{"o[þð]", {verb = "unstressed"}},
	{decompose("stēop"), {noun = "stressed"}},
	{decompose("tō"), {verb = "unstressed", noun = "stressed"}},
	{"under", {verb = "secstressed", noun = "stressed"}},
	{"un", {verb = "unstressed", noun = "stressed", verbal = "stressed"}}, -- uncommon as verb
	{"up", {verb = "unstressed", noun = "stressed"}},
	{decompose("ūt"), {verb = "unstressed", noun = "stressed"}},
	{decompose("ū[þð]"), {noun = "stressed"}},
	{"[wƿ]i[þð]er", {verb = "secstressed", noun = "stressed"}},
	{"[wƿ]i[þð]", {verb = "unstressed"}},
	{"ymb", {verb = "unstressed", noun = "stressed"}},
	{"[þð]urh", {verb = "unstressed", noun = "stressed"}},
}

-- Suffixes that split_on_word_boundaries() may split off the end of a word.
-- Same {pattern, spec} layout as `prefixes`, with the pattern matched at the
-- end of the word. "stressed" is rejected with an error for suffixes, so only
-- "unstressed" and "secstressed" are valid here.
local suffixes = {
	{decompose("bǣre"), {noun = "secstressed"}},
	{"fæst", {noun = "secstressed"}},
	{"feald", {noun = "secstressed"}},
	{"full?", {noun = "unstressed"}},
	{decompose("lēas"), {noun = "secstressed"}},
	-- These can be VERBAL if following a verbal past participle or similar
	{decompose("līċe"), {noun = "secstressed", verb = "secstressed"}},
	-- ī is decomposed into two chars so can't combine into [īi]
	{decompose("li[ċc]"), {noun = "unstressed", verb = "unstressed"}},
	{decompose("lī[ċc]"), {noun = "unstressed", verb = "unstressed"}},
	{"n[eiy]ss?", {noun = "unstressed", verb = "unstressed"}},
	{"sum", {noun = "unstressed"}},
}

-- When auto-generating primary and secondary stress accents, we use these
-- special characters, and later convert to normal IPA accent marks, so
-- we can distinguish auto-generated stress from user-specified stress.
local AUTOACUTE = u(0xFFF0)
local AUTOGRAVE = u(0xFFF1)

-- When the user uses the "explicit allophone" notation such as [z] or [ç] to
-- force a particular allophone, we internally convert that notation into a
-- single special character.
local EXPLICIT_TH = u(0xFFF2)
local EXPLICIT_DH = u(0xFFF3)
local EXPLICIT_S = u(0xFFF4)
local EXPLICIT_Z = u(0xFFF5)
local EXPLICIT_F = u(0xFFF6)
local EXPLICIT_V = u(0xFFF7)
local EXPLICIT_G = u(0xFFF8)
local EXPLICIT_GH = u(0xFFF9)
local EXPLICIT_H = u(0xFFFA)
local EXPLICIT_X = u(0xFFFB)
local EXPLICIT_C = u(0xFFFC)
local EXPLICIT_I = u(0xFFFD)

-- All explicit-allophone placeholders that stand for consonants (appended to
-- `obstruent`). EXPLICIT_I is left out here; it is added to `front_vowel`
-- instead.
local explicit_cons = EXPLICIT_TH .. EXPLICIT_DH .. EXPLICIT_S .. EXPLICIT_Z ..
	EXPLICIT_F .. EXPLICIT_V .. EXPLICIT_G .. EXPLICIT_GH .. EXPLICIT_H ..
	EXPLICIT_X .. EXPLICIT_C

-- Map "explicit allophone" notation into special char. See above.
-- NOTE(review): the only use visible in this file is the commented-out
-- substitution in normalizar(), so the notation currently looks disabled.
local char_to_explicit_char = {
	["þ"] = EXPLICIT_TH,
	["ð"] = EXPLICIT_DH,
	["s"] = EXPLICIT_S,
	["z"] = EXPLICIT_Z,
	["f"] = EXPLICIT_F,
	["v"] = EXPLICIT_V,
	["g"] = EXPLICIT_G,
	["ɣ"] = EXPLICIT_GH,
	["h"] = EXPLICIT_H,
	["x"] = EXPLICIT_X,
	["ç"] = EXPLICIT_C,
	["i"] = EXPLICIT_I,
}

-- Map "explicit allophone" notation into normal spelling, for supporting ann=.
-- NOTE(review): not referenced anywhere else in the visible part of this file.
local char_to_spelling = {
	["þ"] = "þ",
	["ð"] = "þ",
	["s"] = "s",
	["z"] = "s",
	["f"] = "f",
	["v"] = "f",
	["g"] = "g",
	["ɣ"] = "g",
	["h"] = "h",
	["x"] = "h",
	["ç"] = "h",
	["i"] = "i",
}

-- Map "explicit allophone" notation into phonemes, for phonemic output.
-- Applied by generar_pron() to the final result when `fone` is false.
local explicit_char_to_phonemic = {
	[EXPLICIT_TH] = "θ",
	[EXPLICIT_DH] = "θ",
	[EXPLICIT_S] = "s",
	[EXPLICIT_Z] = "s",
	[EXPLICIT_F] = "f",
	[EXPLICIT_V] = "f",
	[EXPLICIT_G] = "ɡ", -- IPA ɡ!
	[EXPLICIT_GH] = "ɡ", -- IPA ɡ!
	[EXPLICIT_H] = "x",
	[EXPLICIT_X] = "x",
	[EXPLICIT_C] = "x",
	[EXPLICIT_I] = "i",
}

-- Map "explicit allophone" notation into IPA phones, for phonetic output.
-- Used as the last entry of phonetic_rules and again by generar_pron() when
-- `fone` is true.
local explicit_char_to_phonetic = {
	[EXPLICIT_TH] = "θ",
	[EXPLICIT_DH] = "ð",
	[EXPLICIT_S] = "s",
	[EXPLICIT_Z] = "z",
	[EXPLICIT_F] = "f",
	[EXPLICIT_V] = "v",
	[EXPLICIT_G] = "ɡ", -- IPA ɡ!
	[EXPLICIT_GH] = "ɣ",
	[EXPLICIT_H] = "h",
	[EXPLICIT_X] = "x",
	[EXPLICIT_C] = "ç",
	[EXPLICIT_I] = "i",
}

-- Building blocks for the patterns used throughout this module. Names without
-- a suffix are bare character lists; names ending in _c are the same list
-- wrapped in [...] so they can be dropped into a pattern as a character class.

-- Every accent mark, user-supplied or auto-generated, plus the macron.
local accent = MACRON .. ACUTE .. GRAVE .. CFLEX .. AUTOACUTE .. AUTOGRAVE
local accent_c = "[" .. accent .. "]"
-- Same as `accent` but without the macron, i.e. only stress-related marks.
local stress_accent = ACUTE .. GRAVE .. CFLEX .. AUTOACUTE .. AUTOGRAVE
local stress_accent_c = "[" .. stress_accent .. "]"
local back_vowel = "aɑou"
local front_vowel = "eiyæœø" .. EXPLICIT_I
local vowel = back_vowel .. front_vowel
local vowel_or_accent = vowel .. accent
local vowel_c = "[" .. vowel .. "]"
local vowel_or_accent_c = "[" .. vowel_or_accent .. "]"
local non_vowel_c = "[^" .. vowel .. "]"
local front_vowel_c = "[" .. front_vowel .. "]"
-- The following include both IPA symbols and letters (including regular g and IPA ɡ)
-- so it can be used at any step of the process.
local obstruent = "bcċçdfgɡɣhkpqstvxzþðθʃʒ" .. explicit_cons
local resonant = "lmnŋrɫ"
local glide = "ġjwƿ"
local cons = obstruent .. resonant .. glide
local cons_c = "[" .. cons .. "]"
-- Sounds that trigger fricative voicing in phonetic_rules.
local voiced_sound = vowel .. "lrmnwjbdɡ" -- WARNING, IPA ɡ used here

-- These rules operate in order, and apply to the actual spelling,
-- after (1) macron decomposition, (2) syllable and prefix splitting,
-- (3) placement of primary and secondary stresses at the beginning
-- of the syllable. Each syllable will be separated either by ˈ
-- (if the following syllable is stressed), by ˌ (if the following
-- syllable has secondary stress), or by . (otherwise). In addition,
-- morpheme boundaries where the consonants on either side should be
-- treated as at the beginning/end of word (i.e. between prefix and
-- word, or between words in a compound word) will be marked with ⁀
-- before the syllable separator, and the beginning and end of text
-- will be marked by ⁀⁀. The output of this is fed into phonetic_rules,
-- and then is used to generate the displayed phonemic pronunciation
-- by removing ⁀ symbols.
--
-- Each rule is {pattern, replacement[, allowed_pos]} and is applied by
-- apply_rules(): the replacement may be a string, a lookup table or a
-- function, and a rule carrying an allowed_pos list only runs when the
-- word's part of speech is in that list.
local phonemic_rules = {
	{MACRON, "ː"},
	{"eoː", "oː"}, -- e.g. ġeōmor
	{"eaː", "aː"},
	{"[ei]ː?[aeo]", {
		-- Alternative notation for short diphthongs: iu̯, eo̯, æɑ̯
		-- Alternative notation for long diphthongs: iːu̯, eːo̯, æːɑ̯
		["ea"] = "æ͜ɑ",
		["eːa"] = "æ͜ɑː",
		["eo"] = "e͜o",
		["eːo"] = "e͜oː",
		["io"] = "i͜u",
		["iːo"] = "i͜uː",
		["ie"] = "i͜y",
		["iːe"] = "i͜yː",
	}},
	-- sċ between vowels when at the beginning of a syllable should be ʃ.ʃ
	{"(" .. vowel_c .. "ː?)([.ˈˌ]?)sċ(" .. vowel_c .. ")", "%1ʃ%2ʃ%3"},
	-- other sċ should be ʃ; note that sċ divided between syllables becomes s.t͡ʃ
	{"sċ", "ʃ"},
	-- x between vowels when at the beginning of a syllable should be k.s;
	-- remaining x handled below
	{"(" .. vowel_c .. "ː?)([.ˈˌ]?)x(" .. vowel_c .. ")", "%1k%2s%3"},
	-- z between vowels when at the beginning of a syllable should be t.s;
	-- remaining z handled below
	{"(" .. vowel_c .. "ː?)([.ˈˌ]?)z(" .. vowel_c .. ")", "%1t%2s%3"},
	-- short front vowel + -rian, -riend, -rienne, -riende in verb or verbal is
	-- rendered with /j/; we need to carefully change the syllable structure
	-- when doing this
	{"(" .. front_vowel_c .. ")%.ri%.(an⁀)", "%1r.ġ%2", {VERB}},
	{"(" .. front_vowel_c .. ")%.ri%.(end⁀)", "%1r.ġ%2", {VERB, VERBAL}},
	{"(" .. front_vowel_c .. ")%.ri%.(en%.[nd]e⁀)", "%1r.ġ%2", {VERB, VERBAL}},
	-- Geminates and clusters of palatal/velar stops, handled before the
	-- single-letter mapping below.
	{"nċ([.ˈˌ]?)ġ", "n%1j"},
	{"ċ([.ˈˌ]?)ġ", "j%1j"},
	{"c([.ˈˌ]?)g", "g%1g"},
	{"ċ([.ˈˌ]?)ċ", "t%1t͡ʃ"},
	-- Remaining single letters are mapped to their phonemes; characters
	-- without an entry are kept as they are.
	{".", {
		["ċ"] = "t͡ʃ",
		["c"] = "k",
		["ġ"] = "j",
		["h"] = "x",
		["þ"] = "θ",
		["ð"] = "θ",
		["ƿ"] = "w",
		["x"] = "ks",
		["z"] = "ts",
		["g"] = "ɡ", -- map to IPA ɡ
		["a"] = "ɑ",
		["œ"] = "ø",
	}},
}

-- Voiceless fricative -> voiced counterpart; used by the fricative-voicing
-- rule in phonetic_rules.
local fricative_to_voiced = {
	["f"] = "v",
	["s"] = "z",
	["θ"] = "ð",
}

-- Inverse of fricative_to_voiced; used by the phonetic_rules entry that
-- devoices fricatives between unstressed vowels.
local fricative_to_unvoiced = {
	["v"] = "f",
	["z"] = "s",
	["ð"] = "θ",
}

-- These rules operate in order, on the output of phonemic_rules.
-- The output of this is used to generate the displayed phonetic
-- pronunciation by removing ⁀ symbols.
--
-- Rules have the same {pattern, replacement[, allowed_pos]} layout as in
-- phonemic_rules and are applied by apply_rules().
local phonetic_rules = {
	-- Fricative voicing between voiced sounds. Note, the following operates
	-- across a ⁀ boundary for a fricative before the boundary but not after.
	{"([" .. voiced_sound .. "][ː.ˈˌ]*)([fsθ])([ː.ˈˌ⁀]*[" .. voiced_sound .. "])",
		function(s1, c, s2)
			return s1 .. fricative_to_voiced[c] .. s2
		end
	},
	-- Fricative between unstressed vowels should be devoiced.
	-- Note that unstressed syllables are preceded by . while stressed
	-- syllables are preceded by a stress mark.
	{"(%.[^.⁀][" .. vowel .. DOUBLE_BREVE_BELOW .. "ː]*%.)([vzð])",
		function(s1, c)
			return s1 .. fricative_to_unvoiced[c]
		end
	},
	-- Final -sian, -siend, -sienne, -siende (and variants such as -siġan,
	-- -siġend, etc.) in verb or verbal is rendered with [s]; clǣnsian will
	-- have to be special-cased with ''[z]''
	{"(" .. cons_c .. "ː?" .. "%.)z(i%.j?ɑn⁀)", "%1s%2", {VERB}},
	{"(" .. cons_c .. "ː?" .. "%.)z(i%.j?end⁀)", "%1s%2", {VERB, VERBAL}},
	{"(" .. cons_c .. "ː?" .. "%.)z(i%.j?en%.[nd]e⁀)", "%1s%2", {VERB, VERBAL}},
	-- Final unstressed -þu/-þo after a consonant should be devoiced.
	{"(" .. cons_c .. "ː?" .. "%.)ð([uo]⁀)",
		function(s1, s2)
			return s1 .. "θ" .. s2
		end
	},
	-- x + resonant becomes the corresponding voiceless resonant.
	{"x[wnlr]", {
		["xw"] = "ʍ",
		["xl"] = "l̥",
		["xn"] = "n̥",
		["xr"] = "r̥",
	}},
	-- Note, the following will not operate across a ⁀ boundary.
	{"n([.ˈˌ]?[ɡk])", "ŋ%1"}, -- WARNING, IPA ɡ used here
	{"n([.ˈˌ]?)j", "n%1d͡ʒ"},
	{"j([.ˈˌ]?)j", "d%1d͡ʒ"},
	{"([^x][⁀.ˈˌ])x", "%1h"},      -- [h] occurs as a syllable-initial allophone
	{"(" .. front_vowel_c .. ")x", "%1ç"}, -- [ç] occurs after front vowels
	-- An IPA ɡ after a word/prefix boundary, after another ɡ or after n
	-- (previously converted to ŋ in this circumstance) should remain as ɡ,
	-- while all other ɡ's should be converted to ɣ except that word-final ɡ
	-- becomes x. We do this by converting the ɡ's that should remain to regular
	-- g (which should never occur otherwise), convert the remaining IPA ɡ's to ɣ
	-- or x, and then convert the regular g's back to IPA ɡ.
	-- NOTE(review): no rule in this table actually turns word-final ɡ into x;
	-- every remaining ɡ becomes ɣ. Confirm whether that rule was dropped on
	-- purpose.
	{"ɡ([.ˈˌ]?)ɡ", "g%1g"}, -- WARNING, IPA ɡ on the left, regular g on the right
	{"([ŋ⁀])([.ˈˌ]?)ɡ", "%1%2g"}, -- WARNING, IPA ɡ on the left, regular g on the right 
	{"ɡ", "ɣ"},
	{"g", "ɡ"}, -- WARNING, regular g on the left, IPA ɡ on the right
	-- Geminate l/r and l/r before another consonant get the velarized variant.
	{"l([.ˈˌ]?)l", "ɫ%1ɫ"},
	{"r([.ˈˌ]?)r", "rˠ%1rˠ"},
	{"l([.ˈˌ]?" .. cons_c .. ")", "ɫ%1"},
	{"r([.ˈˌ]?" .. cons_c .. ")", "rˠ%1"},
	-- Geminate consonants within a single syllable are pronounced singly.
	-- Does not apply e.g. to ''ǣttren'', which will be divided as ''ǣt.tren''.
	{"(" .. cons_c .. ")%1", "%1"},
	{"rˠrˠ", "rˠ"},
	-- [In the sequence vowel + obstruent + resonant in a single syllable,
	-- the resonant should become syllabic, e.g. ādl [ˈɑːdl̩], blōstm [bloːstm̩],
	-- fæþm [fæðm̩], bēacn [ˈbæːɑ̯kn̩]. We allow anything but a syllable or word
	-- boundary between the vowel and the obstruent.] [BASED ON INPUT FROM
	-- [[User:Urszag]], I'VE DECIDED AGAINST THIS]
	-- {"(" .. vowel_c .. "[^.ˈˌ⁀]*[" .. obstruent .. "]ː?[" .. resonant .. "])", "%1" .. SYLLABIC},
	-- also -mn e.g stemn /ˈstemn̩/; same for m + other resonants except m
	-- {"(" .. vowel_c .. "[^.ˈˌ⁀]*mː?[lnŋrɫ])", "%1" .. SYLLABIC},
	-- Finally, replace explicit-allophone placeholders with their IPA phones.
	{".", explicit_char_to_phonetic},
}

-- Spelled-out names of single letters and runes. procesar_pron_args() looks
-- up the first help entry here; on a hit, the one-element list found replaces
-- both args["ayuda"] and args["tl"], so the pronunciation is generated from
-- the name rather than from the bare letter. Upper- and lower-case Latin
-- letters map to the same names.
local pron_abc = {
    ["A"] = {"a"},
	["B"] = {"bee"},
	["C"] = {"cee"},
	["D"] = {"dee"},
	["E"] = {"e"},
	["F"] = {"eff"},
	["G"] = {"gee"},
	["H"] = {"aitch"},
	["I"] = {"i"},
	["J"] = {"jay"},
	["K"] = {"kay"},
	["L"] = {"el"},
	["M"] = {"em"},
	["N"] = {"en"},
	["O"] = {"o"},
	["P"] = {"pee"},
	["Q"] = {"cue"},
	["R"] = {"ar"},
	["S"] = {"ess"},
	["T"] = {"tee"},
	["U"] = {"u"},
	["V"] = {"vee"},
	["W"] = {"double-u"},
	["X"] = {"ex"},
	["Y"] = {"wye"},
	["Z"] = {"zed"},
    ["a"] = {"a"},
	["b"] = {"bee"},
	["c"] = {"cee"},
	["d"] = {"dee"},
	["e"] = {"e"},
	["f"] = {"eff"},
	["g"] = {"gee"},
	["h"] = {"aitch"},
	["i"] = {"i"},
	["j"] = {"jay"},
	["k"] = {"kay"},
	["l"] = {"el"},
	["m"] = {"em"},
	["n"] = {"en"},
	["o"] = {"o"},
	["p"] = {"pee"},
	["q"] = {"cue"},
	["r"] = {"ar"},
	["s"] = {"ess"},
	["t"] = {"tee"},
	["u"] = {"u"},
	["v"] = {"vee"},
	["w"] = {"double-u"},
	["x"] = {"ex"},
	["y"] = {"wye"},
	["z"] = {"zed"},
	-- Runes.
	["ᚠ"] = {"feoh"},
	["ᚢ"] = {"ūr"},
	["ᚦ"] = {"þorn"},
	["ᚩ"] = {"ōs"},
	["ᚱ"] = {"rād"},
	["ᚳ"] = {"cēn"},
	["ᚷ"] = {"gyfu"},
	["ᚹ"] = {"wynn"},
	["ᚻ"] = {"hægl"},
	["ᚾ"] = {"nēod"},
	["ᛁ"] = {"īs"},
	["ᛡ"] = {"gēar"},
	["ᛄ"] = {"gēar"},
	["ᛇ"] = {"īw"},
	["ᛈ"] = {"peorð"},
	["ᛉ"] = {"ilcs"},
	["ᛋ"] = {"sigel"},
	["ᚴ"] = {"sigel"},
	["ᛏ"] = {"Tīw"},
	["ᛒ"] = {"beorc"},
	["ᛖ"] = {"eh"},
	["ᛗ"] = {"mann"},
	["ᛚ"] = {"lagu"},
	["ᛝ"] = {"ing"},
	["ᛟ"] = {"ēðel"},
	["ᛞ"] = {"dæg"},
	["ᚪ"] = {"āc"},
	["ᚫ"] = {"æsc"},
	["ᛠ"] = {"ēar"},
	["ᚣ"] = {"ȳr"},
}

-- Normalize raw input before it is split into fragments and words.
--
-- texto: the spelling supplied by the caller.
-- Returns three values:
--   1. the lower-cased text with fragment-delimiting punctuation turned into
--      " | ", remaining punctuation removed and whitespace/bars collapsed;
--   2. is_prefix: second result of stripping a trailing hyphen (presumably
--      truthy when the input ended in "-" or "‐");
--   3. is_suffix: the same for a leading hyphen.
--
-- NOTE(review): PUNTUACION contains ".", so a period in the input becomes a
-- fragment boundary here and cannot reach the syllable splitter as a forced
-- syllable break -- confirm this is intended.
local function normalizar(texto)
	local is_prefix, is_suffix
	texto = strlower(texto)
	-- Explicit-allophone notation ([z], [ç], ...) is currently disabled:
	--t = strsub(t, "%[(.)%]", char_to_explicit_char)
	-- Turn whatever delimits fragments into IPA foot boundaries (|).
	texto = strsubrep(texto, PUNTUACION, " | ")
	-- Drop any punctuation that is still left.
	texto = strsubrep(texto, PUNTUACION_EXTRA, "")
	texto, is_prefix = strsubb(texto, "[%-‐]$", "")
	-- Assign to the declared local; the previous misspelling (`is_sufix`)
	-- silently created and returned a global instead.
	texto, is_suffix = strsubb(texto, "^[%-‐]", "")
	-- Disabled: turning inner hyphens into spaces (austro-húngaro, franco-italiano).
	-- texto = strsubrep(texto, "[%-‐]", " ")

	-- Finally, collapse redundant bars and whitespace.
	texto = strsubrep(texto, "%s*|%s*|%s*", " | ")
	texto = strsubrep(texto, "%s+", " ")
	texto = strstrip(texto, "[%s|]+")

	return texto, is_prefix, is_suffix
end

-- Run a rule table (phonemic_rules or phonetic_rules) over a word.
--
-- word:  the string to transform.
-- rules: list of {pattern, replacement[, allowed_pos]} entries, applied in order.
-- pos:   part of speech of the word; a rule that carries an allowed_pos list
--        is skipped unless that list contains `pos`.
-- Returns the transformed word.
local function apply_rules(word, rules, pos)
	for _, rule in ipairs(rules) do
		local pattern, replacement, allowed_pos = rule[1], rule[2], rule[3]
		-- A rule is restricted away only when it names parts of speech and
		-- ours is not among them.
		local restricted = allowed_pos and not m_table.contains(allowed_pos, pos)
		if not restricted then
			word = strsub(word, pattern, replacement)
		end
	end
	return word
end

-- Key used in the prefix/suffix stress specs for each numeric part-of-speech
-- constant. The specs in `prefixes` and `suffixes` are keyed by name, while
-- the rest of the module passes SUST/VERB/VERBAL around.
local pos_to_stress_key = {
	[SUST] = "noun",
	[VERB] = "verb",
	[VERBAL] = "verbal",
}

-- Look up the stress type ("unstressed", "stressed" or "secstressed") that a
-- prefix/suffix spec assigns to a part of speech.
--
-- stress_spec: second element of an entry in `prefixes` or `suffixes`.
-- pos:         SUST, VERB or VERBAL (a key name such as "verb" is accepted too).
-- Returns the stress type, or nil when the affix is not recognized for this
-- part of speech. VERBAL falls back to the verb entry when the spec has no
-- entry of its own.
local function lookup_stress_spec(stress_spec, pos)
	-- Previously the spec was indexed with the numeric constant directly,
	-- which never matched the string keys, so no affix was ever split off.
	local key = pos_to_stress_key[pos] or pos
	return stress_spec[key] or (key == "verbal" and stress_spec.verb) or nil
end

-- Split a (decomposed) word into parts at explicit boundaries and at
-- recognized prefixes and suffixes, and mark the stressed vowel of each part.
--
-- word: the word, possibly containing the boundary characters "<", ">" and
--       "-", and "+" to forbid a split at that position.
-- pos:  part of speech (SUST, VERB or VERBAL); decides which affixes are split
--       off and how they are stressed.
-- Returns a list of parts in word order. Auto-generated stress is marked with
-- AUTOACUTE/AUTOGRAVE after the first vowel of a part; all "+" are removed.
-- Raises an error on an unknown stress type, or on a suffix spec that asks for
-- primary stress.
local function split_on_word_boundaries(word, pos)
	local retparts = {}
	-- The separators are captured, so `parts` alternates text and separator;
	-- text sits at the odd indices (hence the step of 2 below).
	local parts = strsplit(word, "([<>%-])")
	local i = 1
	local saw_primary_stress = false
	while i <= #parts do
		local split_part = false
		local insert_position = #retparts + 1
		-- Parts attached with "<" or ">" are not searched for affixes.
		if parts[i + 1] ~= "<" and parts[i - 1] ~= ">" then
			-- Split off any prefixes.
			while true do
				local broke_prefix = false
				for _, prefixspec in ipairs(prefixes) do
					local prefix_pattern = prefixspec[1]
					local stress_spec = prefixspec[2]
					local pos_stress = lookup_stress_spec(stress_spec, pos)
					local prefix, rest = strmatch(parts[i], "^(" .. prefix_pattern .. ")(.*)$")
					if prefix then
						if not pos_stress then
							-- prefix not recognized for this POS, don't split here
						elseif stress_spec.restriction and not strfind(rest, stress_spec.restriction) then
							-- restriction not met, don't split here
						elseif strfind(rest, "^%+") then
							-- explicit non-boundary here, so don't split here
						elseif not strfind(rest, vowel_c) then
							-- no vowels, don't split here
						elseif strfind(rest, "^..?$") then
							-- only two letters, unlikely to be a word, probably an ending, so don't split
							-- here
						else
							local initial_cluster, after_cluster = strmatch(rest, "^(" .. non_vowel_c .. "*)(.-)$")
							if strfind(initial_cluster, "..") and (
								not (onsets_2[initial_cluster] or secondary_onsets_2[initial_cluster] or
									onsets_3[initial_cluster])) then
								-- initial cluster isn't a possible onset, don't split here
							elseif strfind(initial_cluster, "^x") then
								-- initial cluster isn't a possible onset, don't split here
							elseif strfind(after_cluster, "^" .. vowel_c .. "$") then
								-- remainder is a cluster + short vowel,
								-- unlikely to be a word so don't split here
							else
								-- break the word in two; next iteration we process
								-- the rest, which may need breaking again
								parts[i] = rest
								if pos_stress == "unstressed" then
									-- don't do anything
								elseif pos_stress == "secstressed" or (saw_primary_stress and pos_stress == "stressed") then
									prefix = strsub(prefix, "(" .. vowel_c .. ")", "%1" .. AUTOGRAVE, 1)
								elseif pos_stress == "stressed" then
									prefix = strsub(prefix, "(" .. vowel_c .. ")", "%1" .. AUTOACUTE, 1)
									saw_primary_stress = true
								else
									error("Unrecognized stress spec for pos=" .. pos .. ", prefix=" .. prefix .. ": " .. pos_stress)
								end
								-- Prefixes go in front, in the order found.
								insert(retparts, insert_position, prefix)
								insert_position = insert_position + 1
								broke_prefix = true
								break
							end
						end
					end
				end
				if not broke_prefix then
					break
				end
			end

			-- Now do the same for suffixes.
			while true do
				local broke_suffix = false
				for _, suffixspec in ipairs(suffixes) do
					local suffix_pattern = suffixspec[1]
					local stress_spec = suffixspec[2]
					local pos_stress = lookup_stress_spec(stress_spec, pos)
					local rest, suffix = strmatch(parts[i], "^(.-)(" .. suffix_pattern .. ")$")
					if suffix then
						if not pos_stress then
							-- suffix not recognized for this POS, don't split here
						elseif stress_spec.restriction and not strfind(rest, stress_spec.restriction) then
							-- restriction not met, don't split here
						elseif strfind(rest, "%+$") then
							-- explicit non-boundary here, so don't split here
						elseif not strfind(rest, vowel_c) then
							-- no vowels, don't split here
						else
							local before_cluster, final_cluster = strmatch(rest, "^(.-)(" .. non_vowel_c .. "*)$")
							if strfind(final_cluster, "%..") then
								-- syllable division within or before final
								-- cluster, don't split here
							else
								-- break the word in two; next iteration we process
								-- the rest, which may need breaking again
								parts[i] = rest
								if pos_stress == "unstressed" then
									-- don't do anything
								elseif pos_stress == "secstressed" then
									suffix = strsub(suffix, "(" .. vowel_c .. ")", "%1" .. AUTOGRAVE, 1)
								elseif pos_stress == "stressed" then
									error("Primary stress not allowed for suffixes (suffix=" .. suffix .. ")")
								else
									error("Unrecognized stress spec for pos=" .. pos .. ", suffix=" .. suffix .. ": " .. pos_stress)
								end
								-- Inserting at a fixed position puts an inner
								-- suffix (found later) before an outer one.
								insert(retparts, insert_position, suffix)
								broke_suffix = true
								break
							end
						end
					end
				end
				if not broke_suffix then
					break
				end
			end
		end

		-- Fetch the first stress-related accent itself. This must be a match,
		-- not a find: the value is compared against the accent characters below.
		local acc = strmatch(parts[i], "(" .. stress_accent_c .. ")")
		if acc == CFLEX then
			-- remove circumflex but don't accent
			parts[i] = gsub(parts[i], CFLEX, "")
		elseif acc == ACUTE or acc == AUTOACUTE then
			saw_primary_stress = true
		elseif not acc and parts[i + 1] ~= "<" and parts[i - 1] ~= ">" then
			-- Add primary or secondary stress on the part; primary stress if no primary
			-- stress yet, otherwise secondary stress.
			acc = saw_primary_stress and AUTOGRAVE or AUTOACUTE
			saw_primary_stress = true
			parts[i] = strsub(parts[i], "(" .. vowel_c .. ")", "%1" .. acc, 1)
		end
		-- The remaining stem goes after its prefixes and before its suffixes.
		insert(retparts, insert_position, parts[i])
		i = i + 2
	end

	-- remove any +, which has served its purpose
	-- (use the loop's own index; the outer `i` points past the end of `parts`)
	for idx, part in ipairs(retparts) do
		retparts[idx] = gsub(part, "%+", "")
	end
	return retparts
end

-- Split a run of vowels (with their accents) into vowel units, keeping the
-- pairs listed in `diphthongs` together as one unit.
--
-- vowelseq: a string made only of vowels and accent marks.
-- Returns a list of vowels/diphthongs in order. Raises an error if anything
-- other than a vowel with its accents is found in the sequence.
local function break_vowels(vowelseq)
	-- Splitting on a captured pattern yields alternating gaps (odd indices)
	-- and matched vowels (even indices).
	local pieces = strsplit(vowelseq, "(" .. vowel_c .. accent_c .. "*)")
	local total = #pieces
	local result = {}

	local function assert_gap_empty(char)
		if char ~= "" then
			error("Something wrong, non-vowel '" .. char .. "' seen in vowel sequence '" .. vowelseq .. "'")
		end
	end

	-- Diphthongs are recognized regardless of stress marks (macrons count).
	local function without_stress(v)
		return strsub(v, stress_accent_c, "")
	end

	local idx = 1
	while idx <= total do
		if idx % 2 == 0 then
			local merged = false
			if idx < total - 1 then
				local pair = without_stress(pieces[idx]) .. without_stress(pieces[idx + 2])
				if diphthongs[pair] then
					assert_gap_empty(pieces[idx + 1])
					insert(result, pieces[idx] .. pieces[idx + 2])
					-- Skip both vowels and the gap between them.
					idx = idx + 3
					merged = true
				end
			end
			if not merged then
				insert(result, pieces[idx])
				idx = idx + 1
			end
		else
			assert_gap_empty(pieces[idx])
			idx = idx + 1
		end
	end
	return result
end

-- Decompose a word into an alternating list of C and V components. A C
-- component is a (possibly empty) run of consonants; a V component is one
-- vowel or diphthong. The list starts and ends with a C component, so C
-- components sit at the odd positions and V components at the even ones, and
-- the total count is always odd.
local function break_into_c_and_v_components(word)
	local components = {}
	local pieces = strsplit(word, "(" .. vowel_or_accent_c .. "+)")
	for idx, piece in ipairs(pieces) do
		if idx % 2 == 0 then
			-- A vowel run may hold several vowels; separate consecutive ones
			-- with an empty C component to keep the alternation intact.
			for nth, v in ipairs(break_vowels(piece)) do
				if nth > 1 then
					insert(components, "")
				end
				insert(components, v)
			end
		else
			insert(components, piece)
		end
	end
	return components
end

-- Split a word part into syllables.
--
-- word: one part as produced by split_on_word_boundaries(); a "." inside a
--       consonant cluster forces the syllable break at that point.
-- Returns a list of syllable strings. A part without any vowel is returned
-- as a single-element list.
--
-- The word is first broken into alternating C/V components; every consonant
-- cluster is then distributed over the neighbouring vowel components in
-- place, and finally only the vowel components (now whole syllables) are
-- kept.
local function split_into_syllables(word)
	local cons_vowel = break_into_c_and_v_components(word)
	if #cons_vowel == 1 then
		-- No vowel at all: nothing to split.
		return cons_vowel
	end
	for i = 1, #cons_vowel do
		if i % 2 == 1 then
			-- consonant
			local cluster = cons_vowel[i]
			local len = strlen(cluster)
			if i == 1 then
				-- Word-initial cluster: all of it starts the first syllable.
				cons_vowel[i + 1] = cluster .. cons_vowel[i + 1]
			elseif i == #cons_vowel then
				-- Word-final cluster: all of it closes the last syllable.
				cons_vowel[i - 1] = cons_vowel[i - 1] .. cluster
			elseif strfind(cluster, "%.") then
				-- Explicit break: split at the first period.
				local before_break, after_break = strmatch(cluster, "^(.-)%.(.*)$")
				cons_vowel[i - 1] = cons_vowel[i - 1] .. before_break
				cons_vowel[i + 1] = after_break .. cons_vowel[i + 1]
			elseif len == 0 then
				-- do nothing
			elseif len == 1 then
				-- A single consonant goes with the following syllable.
				cons_vowel[i + 1] = cluster .. cons_vowel[i + 1]
			elseif len == 2 then
				local c1, c2 = strmatch(cluster, "^(.)(.)$")
				if c1 == "s" and c2 == "ċ" then
					-- sċ is kept together and goes with the following syllable.
					cons_vowel[i + 1] = "sċ" .. cons_vowel[i + 1]
				else
					-- Two consonants are split down the middle.
					cons_vowel[i - 1] = cons_vowel[i - 1] .. c1
					cons_vowel[i + 1] = c2 .. cons_vowel[i + 1]
				end
			else
				-- check for onset_3 preceded by consonant(s).
				local first3, last3 = strmatch(cluster, "^(.-)(...)$")
				if #first3 > 0 and onsets_3[last3] then
					cons_vowel[i - 1] = cons_vowel[i - 1] .. first3
					cons_vowel[i + 1] = last3 .. cons_vowel[i + 1]
				else
					-- Then a two-consonant onset; a secondary onset only
					-- counts when the consonant before it is not l or r.
					local first2, last2 = strmatch(cluster, "^(.-)(..)$")
					if onsets_2[last2] or (secondary_onsets_2[last2] and not first2:find("[lr]$")) then
						cons_vowel[i - 1] = cons_vowel[i - 1] .. first2
						cons_vowel[i + 1] = last2 .. cons_vowel[i + 1]
					else
						-- Otherwise only the last consonant starts the next syllable.
						local first, last = strmatch(cluster, "^(.-)(.)$")
						cons_vowel[i - 1] = cons_vowel[i - 1] .. first
						cons_vowel[i + 1] = last .. cons_vowel[i + 1]
					end
				end
			end
		end
	end

	-- The even (vowel) slots now hold complete syllables.
	local retval = {}
	for i = 1, #cons_vowel do
		if i % 2 == 0 then
			-- remove any stray periods.
			insert(retval, strsub(cons_vowel[i], "%.", ""))
		end
	end
	return retval
end

-- Join syllables into one string, replacing the accent on a stressed vowel by
-- a stress mark in front of its syllable: ˈ for acute, ˌ for grave. Every
-- other non-initial syllable is preceded by ".".
--
-- syllables:      list of syllable strings.
-- no_auto_stress: when truthy, auto-generated accents (AUTOACUTE/AUTOGRAVE)
--                 are ignored; user-supplied accents still count.
-- Returns the joined string with all stress accents removed.
local function combine_syllables_moving_stress(syllables, no_auto_stress)
	local allow_auto = not no_auto_stress
	local joined = {}
	for idx, syl in ipairs(syllables) do
		local marker
		if syl:find(ACUTE) or (allow_auto and syl:find(AUTOACUTE)) then
			marker = "ˈ"
		elseif syl:find(GRAVE) or (allow_auto and syl:find(AUTOGRAVE)) then
			marker = "ˌ"
		elseif idx > 1 then
			marker = "."
		else
			marker = ""
		end
		insert(joined, marker .. strsub(syl, stress_accent_c, ""))
	end
	return concat(joined)
end

-- Glue the parts of a word (split-off prefixes and suffixes, members of a
-- compound) back together. Parts are joined with ⁀ and the whole word is
-- wrapped in ⁀⁀ on both sides.
local function combine_parts(parts)
	local pieces = {}
	for idx, part in ipairs(parts) do
		-- A non-initial part that does not open with a stress mark still
		-- needs a syllable boundary in front of it.
		local needs_dot = idx > 1 and not strfind(part, "^[ˈˌ]")
		insert(pieces, needs_dot and ("." .. part) or part)
	end
	return "⁀⁀" .. concat(pieces, "⁀") .. "⁀⁀"
end

-- Turn one spelled word into its marked-up form: decomposed, split into parts
-- and syllables, with stress marks and ⁀ boundary markers in place, ready for
-- phonemic_rules.
--
-- word:           the spelling of a single word.
-- pos:            part of speech (SUST, VERB or VERBAL).
-- no_auto_stress: when truthy, auto-generated stress is not shown.
local function transform_word(word, pos, no_auto_stress)
	local parts = split_on_word_boundaries(decompose(word), pos)
	local single_part = #parts == 1
	for idx, part in ipairs(parts) do
		local syllables = split_into_syllables(part)
		-- A word that is a single monosyllabic part gets no automatic stress mark.
		local suppress_auto = no_auto_stress or (single_part and #syllables == 1)
		parts[idx] = combine_syllables_moving_stress(syllables, suppress_auto)
	end
	return combine_parts(parts)
end

-- Guess the part of speech of a word from its ending when the caller did not
-- supply one. Returns VERB, VERBAL or SUST.
local function default_pos(word)
	-- Verbs in -an/-ōn/-ēon and inflected infinitives in -enne.
	for _, ending in ipairs({"[aāō]n$", "ēon$", "enne$"}) do
		if strfind(word, ending) then
			return VERB
		end
	end

	-- Adjectives in -līċ, adverbs in -līċe and nouns in -nes may be built on
	-- nouns or on participles (which are VERBAL), so strip those endings and
	-- judge by what is left.
	local stem = strsub(word, "^(.*" .. vowel_c .. ".*)l[iī][cċ]e?$", "%1")
	stem = strsub(stem, "^(.*" .. vowel_c .. ".*)n[eiy]ss?$", "%1")

	-- Participles in -end(e)/-en/-ed/-od and verbal nouns in -ing/-ung.
	for _, ending in ipairs({"ende?$", "[eo]d$", "en$", "[iu]ng$"}) do
		if strfind(stem, ending) then
			return VERBAL
		end
	end
	return SUST
end

-- Generate the pronunciation of a text.
--
-- t1:   the spelling (one or more words, possibly several fragments).
-- fone: when truthy the phonetic rules are applied on top of the phonemic
--       ones; otherwise the result stays phonemic.
-- cg:   list of part-of-speech codes, indexed by the running word number;
--       words without an entry get default_pos().
-- Returns a list holding one list with the single HTML-encoded result string.
local function generar_pron(t1, fone, cg)
	local texto, is_prefix, is_suffix = normalizar(t1)
	local is_affix = is_prefix or is_suffix
	local fragment_out = {}
	local word_no = 1

	for _, fragmento in ipairs(strsplit(texto, "%s*|%s*")) do
		local palabras = strsplit(fragmento, "%s")
		-- A lone affix gets no automatic stress.
		local no_auto_stress = is_affix and #palabras == 1
		local word_out = {}
		for _, palabra in ipairs(palabras) do
			local pos = cg[word_no] or default_pos(palabra)
			local pron = transform_word(palabra, pos, no_auto_stress)
			pron = apply_rules(pron, phonemic_rules, pos)
			if fone then
				pron = apply_rules(pron, phonetic_rules, pos)
			end
			insert(word_out, pron)
			word_no = word_no + 1
		end
		insert(fragment_out, concat(word_out, " "))
	end

	-- Resolve explicit-allophone placeholders, then drop the ⁀ markers.
	local explicit_map = fone and explicit_char_to_phonetic or explicit_char_to_phonemic
	local result = strsub(concat(fragment_out, " | "), ".", explicit_map)
	result = strsub(result, "⁀", "")

	return {{strhtml(result)}}
end

-- Fill in the pronunciation-related arguments of an entry.
--
-- titulo: page title; used as the spelling when no help text was given.
-- args:   argument table; the list fields "ayuda", "ayudaextra", "fone",
--         "fono" and "fgraf" are read and/or written, "tl" and "rima" are set.
-- Returns the same `args` table, modified in place.
--
-- When neither "fone" nor "fono" was supplied, up to 9 pronunciations are
-- generated from the help entries. The rhyme is then derived from the first
-- pronunciation in args.fone.
function export.procesar_pron_args(titulo, args)
	-- POS flags accepted in "ayudaextra" (separated by ";"); unknown flags
	-- are ignored.
	local flag_to_pos = {
		s = SUST, sust = SUST, sustantivo = SUST,
		a = SUST, adj = SUST, adjetivo = SUST,
		v = VERB, verb = VERB, verbo = VERB,
		l = VERBAL, verbal = VERBAL,
	}

	local vino_ayuda = nil
	if #args.ayuda < 1 then
		args.ayuda[1] = titulo
	else
		vino_ayuda = true
	end

	if #args.fone < 1 and #args.fono < 1 then
		-- A single letter or rune is pronounced through its name.
		local nombre = pron_abc[args.ayuda[1]]
		if nombre then
			args.ayuda = nombre
			args.tl = nombre
		end

		local total = #args.ayuda
		local j = 1 -- index into the help entries
		local k = 1 -- number of pronunciations inserted so far (at most 9)
		while k <= 9 and j <= total do
			local cg = {}
			local extra = args.ayudaextra[j]
			local flags
			if extra then
				flags = strsplit(extra, ";")
			else
				flags = {}
			end
			for _, flag in ipairs(flags) do
				local pos = flag_to_pos[flag]
				if pos then
					insert(cg, pos)
				end
			end

			if vino_ayuda then
				args.fgraf[j] = {args.ayuda[j]}
			end

			for _, pron in ipairs(generar_pron(args.ayuda[j], true, cg)) do
				insert(args.fone, pron)
				k = k + 1
				if k > 9 then
					break
				end
			end
			j = j + 1
		end
	end

	-- NOTE(review): the result of this whitespace probe is never used.
	local _tiene_espacios = strfind(titulo, "%s")

	local primera = args.fone[1]
	if primera and primera[1] then
		-- Keep only the last word ...
		local rim = strsub(primera[1], ".*%s([^%s]+)$", "%1")
		-- ... then what follows the last primary stress mark ...
		rim = strsub(rim, "^.*ˈ(.-)$", "%1")
		-- ... and finally everything from the first vowel onwards.
		args.rima = strsub(rim, ".-" .. "([" .. vowel .. "].*" .. ")" .. "$", "%1")
	end

	return args
end


return export