Módulo:String/avanzado

De Wikcionario, el diccionario libre

La documentación para este módulo puede ser creada en Módulo:String/avanzado/doc

-- tomado de https://en.wiktionary.org/wiki/Module:utilities
-- y de https://en.wiktionary.org/wiki/Module:links

-- Local alias for mw.text.decode; used further down when resolving numeric HTML entities.
local decode = mw.text.decode
-- Local alias for mw.ustring.char.
-- NOTE(review): no use of `u` is visible in this file — confirm before removing.
local u = mw.ustring.char

-- Table holding the module's public functions; it is what the module returns at the end of the file.
local export = {}

-- Resolves the HTML entities found in a string into the plaintext they stand for.
-- Named entities are looked up in a data module; numeric entities are decoded with mw.text.decode.
do
	-- Named-entity lookup table, loaded only the first time it is needed.
	local named_entities

	-- gsub callback for named entities: receives the name found between "&" and ";".
	-- An unknown name yields nil, so gsub leaves the original text in place.
	local function resolve_named(name)
		if not named_entities then
			named_entities = mw.loadData("Módulo:datos/entidades")
		end
		return named_entities[name]
	end

	-- gsub callback for numeric entities: receives the whole "&#...;" token.
	-- The token is lowercased first so that a capital "X" hex prefix, which is not supported by default, gets decoded too.
	local function resolve_numbered(token)
		local lowered = token:lower()
		local decoded = decode(lowered)
		if decoded == lowered then
			-- Nothing was decoded: returning nil keeps the match untouched.
			return nil
		end
		return decoded
	end

	function export.get_entities(text)
		-- Each gsub also returns a match count; assigning to one variable drops it.
		local resolved = text:gsub("&([^#&;]+);", resolve_named)
		resolved = resolved:gsub("&#[Xx]?%x+;", resolve_numbered)
		return resolved
	end
end

--function export.get_entities(text)
--	return (text:gsub("&([^#&;]+);", get_named_entity)
--		:gsub("&#[Xx]?%x+;", get_numbered_entity)
--	)
--end

-- Converts the characters of `text` that match `set` into HTML entities (via mw.text.encode).
-- Unless `raw` is truthy, entities already present in `text` are first resolved into plaintext, which allows mixed input and avoids accidental double-conversion.
function export.make_entities(text, set, raw)
	if not raw then
		text = export.get_entities(text)
	end
	return mw.text.encode(text, set)
end

-- Strips internal-link markup from `text`, leaving only what each link displays.
-- `text` may be a string, or a table with an `args` field, in which case `args[1]` is used.
-- File/image links are kept as they are, category links are dropped entirely, and when `tag` is truthy every kept display text is wrapped in <link>...</link>.
-- Returns "" when there is no text.
function export.remove_links(text, tag)
	if type(text) == "table" then
		text = text.args[1]
	end

	if text == "" or not text then
		return ""
	end

	local file_prefixes = {"file", "image", "archivo", "imagen"}
	local category_prefixes = {"category", "categoría", "cat"}

	-- True when `target` begins with one of `prefixes` followed by a colon.
	local function starts_with_any(target, prefixes)
		for _, prefix in ipairs(prefixes) do
			if target:match("^" .. prefix .. ":") then
				return true
			end
		end
		return false
	end

	-- gsub callback: receives the opening marker, the link contents and the closing marker.
	local function rewrite_link(open, inner, close)
		local lowered = inner:lower()
		-- Don't remove files.
		if starts_with_any(lowered, file_prefixes) then
			return open .. inner .. close
		end
		-- Remove categories completely.
		if starts_with_any(lowered, category_prefixes) then
			return ""
		end
		-- In piped links, drop everything before the pipe, unless the pipe is the final character (the pipe trick), in which case only the pipe is dropped.
		local display = inner:match("^[^|]*|(.+)") or inner:match("([^|]+)|$") or inner
		if tag then
			return "<link>" .. display .. "</link>"
		end
		return display
	end

	-- Double brackets are swapped for single control bytes so that the link pattern can exclude nested links with a simple character class.
	local marked = text:gsub("%[%[", "\1")
	marked = marked:gsub("%]%]", "\2")

	-- Parse internal links for the display text.
	marked = marked:gsub("(\1)([^\1\2]-)(\2)", rewrite_link)

	-- Put back the brackets of anything that was left alone.
	marked = marked:gsub("\1", "[[")
	marked = marked:gsub("\2", "]]")

	return marked
end


-- Strips wiki markup from `text`, giving the plaintext of what is displayed on the page.
-- Steps, in order: unstrip strip markers and drop HTML tags; reduce internal links to their display text (categories vanish); drop file/image links; reduce external links to their label; escape leftover square brackets; strip bold/italic quotes and soft hyphens; resolve HTML entities; trim.
-- `text` must be a string. Returns the trimmed plaintext.
function export.get_plaintext(text)
	-- Double brackets become single control bytes so the patterns below can exclude them with a plain character class.
	text = text
		:gsub("%[%[", "\1")
		:gsub("%]%]", "\2")
	
	-- Remove strip markers and HTML tags.
	text = mw.text.unstrip(text)
		:gsub("<[^<>\1\2]+>", "")
		
	-- Parse internal links for the display text, and remove categories.
	-- remove_links returns literal "[[" / "]]" for whatever it kept, so the placeholders have to be applied again; without this the file removal below can never match.
	text = export.remove_links(text)
		:gsub("%[%[", "\1")
		:gsub("%]%]", "\2")
	
	-- Remove files. Prefixes are compared case-insensitively and include the Spanish aliases, i.e. the same set remove_links uses to decide which links to leave in place.
	local file_prefixes = {"file", "image", "archivo", "imagen"}
	text = text:gsub("\1([^\1\2]+)\2",
		function(target)
			local lowered = target:lower()
			for _, prefix in ipairs(file_prefixes) do
				if lowered:match("^" .. prefix .. ":") then
					return ""
				end
			end
			-- Not a file link: nil tells gsub to keep the match unchanged.
			return nil
		end)

	-- Parse external links for the display text.
	text = text:gsub("%[(https?://[^%[%]]+)%]",
		function(capture)
			-- Keep only the label after the URL; a bare URL in brackets displays nothing useful, so it becomes "".
			return capture:match("https?://[^%s%]]+%s([^%]]+)") or ""
		end)
	
	text = text
		:gsub("\1", "[[")
		:gsub("\2", "]]")
	
	-- Any remaining square brackets aren't involved in links, but must be escaped to avoid creating new links.
	text = text:gsub("[%[%]]", mw.text.nowiki)
		
	-- Strip bold, italics and soft hyphens.
	-- The soft hyphen (U+00AD) is written as its UTF-8 bytes so that it is visible in the source.
	text = text
		:gsub("('*)'''(.-'*)'''", "%1%2")
		:gsub("('*)''(.-'*)''", "%1%2")
		:gsub("\194\173", "")
	
	-- Get any HTML entities.
	-- Note: don't decode URL percent encoding, as it shouldn't be used in display text and may cause problems if % is used.
	text = export.get_entities(text)
	
	return mw.text.trim(text)
end

-- Returns the content of a page section, including all of its subsections, or nil if no matching section exists.
-- `content` is raw wikitext. `names` is the requested section: either a single heading name, or a table of names in which each entry is looked for inside the section found for the previous one. For example, {"Spanish", "Noun"} returns the first section called "Noun" under a section called "Spanish"; the two need not be at adjacent heading levels.
-- `level` is optional and fixes the heading level required of the returned section (the last name when `names` is a table). Without it, the first section with a matching name is returned.
-- Raises an error if `level` is greater than 6, if a name contains a newline, or if more than 6 names are given.
function export.get_section(content, names, level)
	local trim = mw.text.trim

	-- Finds one section called `heading` inside `wikitext`, optionally at exactly `wanted_level`.
	local function find_section(wikitext, heading, wanted_level)
		if not wikitext or not heading then
			return nil
		end
		if wanted_level and wanted_level > 6 then
			error("Heading level cannot be greater than 6.")
		end
		if heading:find("[\n\r]") then
			error("Heading name cannot contain a newline.")
		end
		heading = trim(heading)

		-- Byte offset at which the wanted section begins, once it has been found.
		local first_byte
		for position, equals, title in wikitext:gmatch("()%f[^%z\n\r](=+)([^\n\r]+)%2[\t ]*%f[%z\n\r]") do
			local depth = #equals
			if first_byte then
				-- Already inside the section: it ends at the next heading of the same or a higher rank. Nothing ranks below level 6, so any heading ends a level-6 section.
				if wanted_level == 6 or depth <= wanted_level then
					return wikitext:sub(first_byte, position - 1)
				end
			else
				if depth > 6 then
					-- Equals signs beyond the sixth are treated as part of the heading text.
					local surplus = ("="):rep(depth - 6)
					title = surplus .. title .. surplus
					depth = 6
				end
				if (not wanted_level or depth == wanted_level) and trim(title) == heading then
					first_byte = position
					-- From here on the section's own level decides where it ends.
					wanted_level = depth
				end
			end
		end

		-- The section, if found, runs to the end of the text.
		if first_byte then
			return wikitext:sub(first_byte)
		end
		return nil
	end

	if type(names) == "string" then
		return find_section(content, names, level)
	end

	local name_count = #names
	if name_count > 6 then
		error("Not possible specify more than 5 subsections: headings only go up to level 6.")
	end

	-- Narrow the text down one name at a time; `level` only constrains the last name.
	local remaining = content
	for index, heading in ipairs(names) do
		if index ~= name_count then
			remaining = find_section(remaining, heading)
		else
			remaining = find_section(remaining, heading, level)
		end
	end
	return remaining
end

do
	local get_script = require("Módulo:scripts").getByCode
	
	--[=[
	Language-agnostic detection of the best script for a string.

	Every character of the plaintext is handed to charAScript, which returns a list of script codes for it. Each
	script has a two-slot counter: slot 1 counts primary matches (the script was listed first for the character),
	slot 2 counts secondary matches (it was listed anywhere after the first).

	Counters are compared by overall total first; on a tie, by primary matches and then by secondary matches. This
	is what lets `Grek` win over `Polyt` when no character matches `Polyt` exclusively, as `Grek` is a subset of
	`Polyt`.

	`Jpan` and `Kore` are combinations of other scripts, so their counters are derived by summing the counters of
	their constituents.

	Returns the result of get_script for the winning code, or for "None" if nothing was counted.
	]=]
	local function findBestScriptWithoutLang(text)
		-- Metatable of the tally table. The Jpan and Kore fields flag the combined scripts that the lookup below derives from their constituents.
		local tally_mt = {Jpan = true, Kore = true}
		
		-- Metatable of each counter: supplies the "<" used to rank scripts.
		local counter_mt = {
			__lt = function(left, right)
				local left_total = left[1] + left[2]
				local right_total = right[1] + right[2]
				if left_total ~= right_total then
					return left_total < right_total
				end
				if left[1] ~= right[1] then
					return left[1] < right[1]
				end
				return left[2] < right[2]
			end
		}
		
		-- Missing keys read as a fresh counter: a sum of the constituents for the combined scripts, zeroes for anything else. The counter is NOT stored by the lookup itself.
		tally_mt.__index = function(tally, code)
			local counter
			if code == "Jpan" and tally_mt.Jpan then
				counter = {}
				for slot = 1, 2 do
					counter[slot] = tally["Hani"][slot] + tally["Hira"][slot] + tally["Kana"][slot]
				end
			elseif code == "Kore" and tally_mt.Kore then
				counter = {}
				for slot = 1, 2 do
					counter[slot] = tally["Hani"][slot] + tally["Hang"][slot]
				end
			else
				counter = {0, 0}
			end
			return setmetatable(counter, counter_mt)
		end
		
		-- `tally` holds the counters of every script detected so far.
		local tally = setmetatable({}, tally_mt)
		
		text = export.get_plaintext(text)
		
		local composites = {
			Jpan = {["Hani"] = true, ["Hira"] = true, ["Kana"] = true},
			Kore = {["Hani"] = true, ["Hang"] = true}
		}
	
		-- Walk the text one UTF-8 character at a time.
		for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
			local candidates = require("Módulo:scripts/charAScript").charAScript(character, true)
			for rank, code in ipairs(candidates) do
				-- Fetch the counter (created on the fly if absent) and make sure it is actually stored.
				local counter = tally[code]
				tally[code] = counter
				-- First listed script is a primary match, all the others are secondary.
				local slot = (rank == 1) and 1 or 2
				counter[slot] = counter[slot] + 1
			end
		end
		
		-- Check the combined script counts. If a single constituent has the same count (i.e. it's the only one), discard the combined script.
		for composite, constituents in pairs(composites) do
			local composite_counter = tally[composite]
			tally[composite] = composite_counter
			local composite_total = composite_counter[1] + composite_counter[2]
			for constituent in pairs(constituents) do
				local constituent_counter = tally[constituent]
				if (constituent_counter[1] + constituent_counter[2]) == composite_total then
					tally[composite] = nil
					break
				end
			end
		end
		
		-- Keep the script whose counter ranks highest.
		local best_code, best_counter
		for code, counter in pairs(tally) do
			if best_counter == nil or best_counter < counter then
				best_code, best_counter = code, counter
			end
		end
		
		return get_script(best_code or "None")
	end

	-- Picks, among the candidate script codes in `scripts`, the script that best fits `text`.
	-- `scripts` must be a sequence of script codes; it is walked in order, because the first script that covers the whole text is returned and earlier scripts win ties.
	-- `idioma` is accepted for the caller's convenience but is not used here.
	-- Returns a script object obtained from get_script; the "None" script when nothing matches.
	local function findBestScriptWithLang(text, idioma, scripts)
		-- Counting instances of a UTF-8 character pattern is faster than mw.ustring.len.
		local function utf8_length(str)
			local _, n = string.gsub(str, "[\1-\127\194-\244][\128-\191]*", "")
			return n
		end
		
		-- Remove all formatting characters.
		text = export.get_plaintext(text)
		
		-- Try to match every script against the text,
		-- and return the one with the most matching characters.
		local bestcount, bestscript = 0
		
		-- Remove any spacing or punctuation characters, and get resultant length.
		local reducedText = mw.ustring.gsub(text, "[%s%p]+", "")
		local length = utf8_length(reducedText)
		
		-- If the length is 0 then we're probably dealing with a punctuation character, so only remove spacing characters, in case it is script-specific.
		if length == 0 then
			reducedText = mw.ustring.gsub(text, "%s+", "")
			length = utf8_length(reducedText)
			
			if length == 0 then
				return get_script("None")
			end
		end
		
		-- Ensure that "Hant", "Hans" and "Hani" are moved to the end of the list (in that order, if present), as they are a special-case.
		-- ipairs (not pairs) is used so that the remaining scripts are guaranteed to keep their given order.
		local candidates = {}
		local hasHant, hasHans, hasHani = false, false, false
		for _, code in ipairs(scripts) do
			if code == "Hant" then
				hasHant = true
			elseif code == "Hans" then
				hasHans = true
			elseif code == "Hani" then
				hasHani = true
			else
				table.insert(candidates, get_script(code))
			end
		end
		if hasHant then table.insert(candidates, get_script("Hant")) end
		if hasHans then table.insert(candidates, get_script("Hans")) end
		if hasHani then table.insert(candidates, get_script("Hani")) end
		
		for _, script in ipairs(candidates) do
			-- Per the original note, countCharacters counts only the characters that belong to that script, which is what is needed here.
			local count = script:countCharacters(reducedText)
			
			-- Special case for "Hant", "Hans" and "Hani", which are returned if they match at least one character, under the assumption that (1) traditional and simplified characters will not be mixed if a language uses both scripts, and (2) any terms using Han characters with another script (e.g. Latin) will still need a Han code (not counting those which use Jpan or Kore). This is for efficiency, due to the special checks required for "Hant" and "Hans", and to prevent "Hani" from overriding either, as it will always match with at least as many characters, while characters used in both will only match with "Hani".
			if count >= length or ((script._code == "Hant" or script._code == "Hans" or script._code == "Hani") and count > 0) then
				return script
			elseif count > bestcount then
				bestcount = count
				bestscript = script
			end
		end
		
		-- Secondary check for languages that have "Hant" or "Hans" but not "Hani", but which still have multiple scripts (e.g. Macau Pidgin Portuguese): characters which are not exclusively traditional or simplified will not be found by the main check, so a separate "Hani" check is necessary to see if Han characters are present at all. If successful, return "Hant" or "Hans" as applicable.
		-- The Hani count does not depend on the candidate, so it is computed once rather than per loop iteration.
		if (hasHant or hasHans) and not hasHani then
			if get_script("Hani"):countCharacters(reducedText) > 0 then
				for _, script in ipairs(candidates) do
					if script._code == "Hant" or script._code == "Hans" then
						return script
					end
				end
			end
		end
		
		if bestscript then
			return bestscript
		end
		
		-- No matching script was found, so return "None".
		return get_script("None")
	end
		
		
	-- Returns the script object that best fits `text`.
	-- `idioma` is optional; when given, its field [4] is read as either "All" or a comma-separated list of script codes (presumably a language data record — confirm against the callers).
	--   * Empty text, nil, or "-" yields the "None" script.
	--   * No `idioma`, or a script list of "All", triggers language-agnostic detection.
	--   * A list with exactly one code returns that script without inspecting the text.
	--   * Otherwise the text is matched against each listed script.
	function export.findBestScript(text, idioma)
		if (not text) or text == "" or text == "-" then
			return get_script("None")
		end
		
		if not idioma or idioma[4] == "All" then
			return findBestScriptWithoutLang(text)
		end
		
		-- Split on commas and trim each code. Trimming is done per entry because a greedy "[^,]+" capture would otherwise keep whitespace that precedes a comma (or leads the list) as part of the code.
		local scripts = {}
		for entry in idioma[4]:gmatch("[^,]+") do
			local code = entry:match("^%s*(.-)%s*$")
			if code ~= "" then
				scripts[#scripts + 1] = code
			end
		end
		
		-- A single candidate needs no detection at all.
		if scripts[1] and not scripts[2] then
			return get_script(scripts[1])
		end
		
		return findBestScriptWithLang(text, idioma, scripts)
	end
end

return export