Módulo:String/sustituir

De Wikcionario, el diccionario libre

La documentación para este módulo puede ser creada en Módulo:String/sustituir/doc

local insert = table.insert

-- Lua patterns describing formatting that must be shielded from the substitution process.
-- Conventions, as consumed by doTempSubstitutions:
--   * "\1" and "\2" stand for "[[" and "]]"; the text has those swapped in before matching.
--   * The outermost capture spans the whole match; every inner capture is a piece that is
--     replaced by a placeholder and remembered so that it can be restored afterwards.
--   * A trailing "\0" is not matched against the text: it is stripped before the pattern is
--     used and flags the pattern's placeholders as term dividers.
local patterns_ = {
	"((</?link>))\0", -- Special link formatting added by [[Module:links]]
	"((<[^<>\1\2]+>))", -- HTML tag
	"((\1[Ff][Ii][Ll][Ee]:[^\1\2]+\2))\0", -- File link (prefix matched case-insensitively)
	"((\1[Ii][Mm][Aa][Gg][Ee]:[^\1\2]+\2))\0", -- Image link (prefix matched case-insensitively)
	"((\1[Cc][Aa][Tt][Ee][Gg][Oo][Rr][Yy]:[^\1\2]+\2))\0", -- Category link, long "Category:" prefix
	"((\1[Cc][Aa][Tt]:[^\1\2]+\2))\0", -- Category link, short "CAT:" prefix
	"((\1)[^\1\2|]+(\2))\0", -- Bare internal link: only the brackets are captured
	"((\1)[^\1\2|]-(|)[^\1\2]-(\2))\0", -- Piped internal link: brackets and the pipe are captured
	"((%[https?://[^[%] ]+)[^[%]]*(%]))\0", -- External link: "[" plus URL, and the closing "]"
	"((\127'\"`UNIQ%-%-%l+%-%x+%-+QINU`\"'\127))", -- Strip marker
	"('*(''').-'*('''))", -- Bold: the opening and closing ''' are captured
	"('*('').-'*(''))" -- Italics: the opening and closing '' are captured
}

-- Return a new table holding the array part of `t` (indices 1..n, stopping at the
-- first nil), in the same order. Values are copied by reference (shallow copy).
local function table_icopy(t)
	local copy = {}
	for _, value in ipairs(t) do
		copy[#copy + 1] = value
	end
	return copy
end

local escapar = require("Módulo:String/escapar")

-- Temporarily swap formatting (links, HTML tags, strip markers, bold/italic markup, ...) for
-- four-byte placeholder sequences, so that the substitution process cannot disturb it.
--
-- text         string to protect.
-- subbedChars  array collecting the replaced pieces; new pieces are stored after the existing
--              entries, and the index of each piece is what its placeholder encodes.
-- keepCarets   if truthy, carets (and backslash runs directly preceding a caret) are protected too.
-- noTrim       if falsy, leading and trailing whitespace is protected as well, so that it is not
--              trimmed later on; the second round of substitutions passes true here.
--
-- Returns the protected text and the (mutated) subbedChars table.
local function doTempSubstitutions(text, subbedChars, keepCarets, noTrim)
	local active_patterns = table_icopy(patterns_)
	if keepCarets then
		insert(active_patterns, "((\\+)%^)")
		insert(active_patterns, "((%^))")
	end
	if not noTrim then
		insert(active_patterns, "^([\128-\191\244]*(%s+))")
		insert(active_patterns, "((%s+)[\128-\191\244]*)$")
	end

	-- Collapse "[[" and "]]" into single control bytes first; this keeps the patterns simple
	-- and the matching more accurate.
	text = text:gsub("%f[%[]%[%[", "\1")
	text = text:gsub("%f[%]]%]%]", "\2")

	local stored = #subbedChars
	for _, raw_pattern in ipairs(active_patterns) do
		-- A trailing "\0" marks patterns whose placeholders should act as term dividers
		-- (e.g. those standing in for "[[" or "]]"), so that modules scraping the text treat
		-- them as breaks between terms. The marker is removed before the pattern is used.
		local is_term_divider
		local pattern = raw_pattern:gsub("%z$", function(marker)
			is_term_divider = marker == "\0"
			return ""
		end)
		-- Term-divider placeholders get a second byte starting at 128, all others at 136.
		local lead_offset = is_term_divider and 128 or 136

		text = text:gsub(pattern, function(...)
			local captures = {...}
			-- The first capture is the whole match; the remaining ones are the pieces to hide.
			local rebuilt = captures[1]
			for k = 2, #captures do
				local n = stored + k - 1
				subbedChars[n] = captures[k]
				-- Encode the storage index n in the three bytes following "\244".
				local placeholder = "\244" .. string.char(
					math.floor(n / 4096) % 64 + lead_offset,
					math.floor(n / 64) % 64 + 128,
					n % 64 + 128
				)
				rebuilt = rebuilt:gsub(escapar(captures[k]), placeholder, 1)
			end
			stored = stored + #captures - 1
			return rebuilt
		end)
	end

	-- Expand whatever bracket stand-ins are left back into "[[" and "]]".
	text = text:gsub("\1", "%[%[")
	text = text:gsub("\2", "%]%]")
	return text, subbedChars
end

-- Put back the formatting that was temporarily replaced by placeholders.
--
-- text         string containing placeholders.
-- subbedChars  array of the original pieces; entry n is restored wherever the placeholder
--              encoding n occurs, in either its term-divider or its ordinary form.
--
-- Returns the restored text.
-- NOTE(review): nothing in the visible part of this file calls this function - confirm
-- whether it is still needed.
local function undoTempSubstitutions(text, subbedChars)
	local char = string.char
	for n = 1, #subbedChars do
		local lead = math.floor(n / 4096) % 64 + 128
		local tail = char(math.floor(n / 64) % 64 + 128, n % 64 + 128)
		-- The second byte is either `lead` (term divider) or `lead + 8` (ordinary placeholder).
		local placeholder_pattern = "\244[" .. char(lead, lead + 8) .. "]" .. tail
		text = text:gsub(placeholder_pattern, escapar(subbedChars[n]))
	end
	-- Expand any remaining bracket stand-ins back into "[[" and "]]".
	text = text:gsub("\1", "%[%[")
	text = text:gsub("\2", "%]%]")
	return text
end

-- Apply the substitution process to `text`, section by section.
--
-- The text is cut at every placeholder character left by the temporary substitutions, and each
-- resulting section is processed on its own. This keeps placeholder characters away from
-- language-specific modules, which may be unequipped for them. Languages flagged in
-- `contiguous_substitution` are processed in one piece instead.
--
-- text               string to process.
-- subbedChars        array of temporarily substituted pieces, or nil to skip the second round
--                    of temporary substitutions.
-- keepCarets         forwarded to doTempSubstitutions.
-- cod, idioma, sc, substitution_data, function_name
--                    forwarded unchanged to Módulo:String/sustituir_rec; `cod` is also the key
--                    looked up in `contiguous_substitution`.
--
-- Returns: the processed text, the failure value reported by the substitution (if any), the
-- list of categories, and subbedChars.
return function (text, subbedChars, keepCarets, cod, idioma, sc, substitution_data, function_name)
	local fail
	local cats = {}
	local sections
	-- Only split when placeholders are present and the language does not ask for contiguous
	-- substitution.
	if text:match("\244") and not require("Módulo:lenguas/idiomas/puntuacion").contiguous_substitution[cod] then
		sections = mw.text.split(text, "[􀀀-􏿽]")
	else
		sections = {text}
	end

	for _, section in ipairs(sections) do
		-- Empty or whitespace-only sections are skipped (dedicated modules may not handle
		-- them well either).
		local stripped = section:gsub("%s", "")
		if stripped ~= "" then
			local sub, sub_fail, sub_cats = require("Módulo:String/sustituir_rec")(section, cod, idioma, sc, substitution_data, function_name)

			-- Second round of temporary substitutions, in case the substitution process added
			-- formatting of its own. It is skipped when the section already contained
			-- formatting: that formatting had to be escaped to get this far, so it is meant
			-- as raw text.
			if sub and subbedChars then
				local has_formatting = false
				for _, pattern in ipairs(patterns_) do
					if section:match(pattern .. "%z?") then
						has_formatting = true
					end
				end
				if not has_formatting then
					sub, subbedChars = doTempSubstitutions(sub, subbedChars, keepCarets, true)
				end
			end

			-- On failure, hand back the failing result as it is and stop.
			if sub_fail or not sub then
				text, fail, cats = sub, sub_fail, sub_cats or {}
				break
			end

			-- Swap the first occurrence of the section for its processed form.
			text = text:gsub(escapar(section), escapar(sub), 1)

			if type(sub_cats) == "table" then
				for _, cat in ipairs(sub_cats) do
					cats[#cats + 1] = cat
				end
			end
		end
	end

	-- Trim, unless there are only spacing characters, while ignoring any placeholder bytes
	-- sitting at either end.
	if text then
		text = text:gsub("^([\128-\191\244]*)%s+(%S)", "%1%2")
		text = text:gsub("(%S)%s+([\128-\191\244]*)$", "%1%2")
	end

	-- Remove duplicate categories.
	if #cats > 1 then
		cats = require("Módulo:tabla").removeDuplicates(cats)
	end

	return text, fail, cats, subbedChars
end