Module:Language/scripts

From Omniversalis

Documentation for this module may be created at Module:Language/scripts/doc

-- Table of functions exported by this module (returned at the end of the file).
local p = {}
-- Local aliases for the library functions used below.
local gsub = mw.ustring.gsub
local length = mw.ustring.len
local floor = math.floor
-- Lua pattern for one UTF-8 encoded character: an ASCII byte or a multi-byte
-- lead byte, followed by any number of continuation bytes (\128-\191).
-- NOTE(review): not referenced anywhere else in the visible code.
local UTF8Char = "[%z\1-\127\194-\244][\128-\191]*"

-- Codepoint-to-script data; p.codepointToScript reads its `individual` and
-- `ranges` fields.
local codepoint_data = mw.loadData("Module:language/scripts/codepoints")

-- Per-script data indexed by script code; entries may carry a `characters`
-- field.
local data = require("Module:Language/scripts/data")

--[[
	Returns the `characters` field stored for a script code in the script data
	module.

	Reads the script code from frame.args[1]. A missing or empty code yields
	"Please supply a valid script code."; a code that has no entry, or whose
	entry has no `characters` field, yields a "No characters found" message.
]]
function p.print(frame)
	local scriptCode = frame.args[1]
	-- Bail out before any message is built from scriptCode: concatenating a
	-- nil code would raise an error instead of showing the hint.
	if scriptCode == nil or scriptCode == "" then
		return "Please supply a valid script code."
	end

	local scriptData = data[scriptCode]
	local characters = scriptData and scriptData.characters
	return characters or ("No characters found for " .. scriptCode .. ".")
end

-- NOTE(review): this table is never read or written in the visible code; the
-- functions below that use the name `script` declare their own locals.
local script = {}

-- Based on the Script:countCharacters() function of Module:scripts on Wiktionary
--
-- Counts the characters of `text` that match the character set recorded in
-- the `characters` field of the data entry for `scriptCode`.
-- Returns 0 when the script code has no data entry or the entry has no
-- `characters` field.
local function countCharacters(text, scriptCode)
	-- Look the entry up once and tolerate an unknown script code instead of
	-- indexing nil.
	local scriptData = data[scriptCode]
	local characters = scriptData and scriptData["characters"]
	if not characters then
		return 0
	end

	-- The second return value of gsub is the number of substitutions made,
	-- i.e. the number of matching characters; the rewritten text is discarded.
	local _, count = gsub(text, "[" .. characters .. "]", "")
	return count
end

--[[
	Reports whether a text counts as Latin-script.

	Returns true when at least a quarter of the characters in `text` match the
	"Latn" character set, false when fewer do, and nil when `text` is neither
	a string nor a number. Numbers are converted to their string form first.
]]
function p.isLatn(text)
	local textType = type(text)
	if textType == "number" then
		text = tostring(text)
	elseif textType ~= "string" then
		-- tostring() always yields a string, so testing its result can never
		-- reject anything; the argument itself has to be checked.
		return nil
	end

	-- Latin when at least 25% of the characters in the string are Latin.
	return countCharacters(text, "Latn") >= length(text) / 4
end

-- Frame-based wrapper around p.isLatn: checks the first argument of the frame.
function p.Latin(frame)
	return p.isLatn(frame.args[1])
end

-- Script codes that p.getScript leaves out when working out which script a
-- text is written in. "Zzzz" is the fallback that p.codepointToScript returns
-- for code points it cannot place.
-- NOTE(review): "Zinh" and "Zyyy" are presumably the Unicode "Inherited" and
-- "Common" script codes — confirm against the codepoints data module.
local ignore_script = require("Module:TableTools").listToSet{
	"Zinh", "Zyyy", "Zzzz"
}

-- Applies `func(value, key, t)` to every entry of `t` and collects the results
-- in a new array. A table with an entry at index 1 is walked as an array with
-- ipairs (results keep their indices); any other table is walked with pairs
-- and the results are numbered in traversal order.
local function map(func, t)
	local results = {}

	if not t[1] then
		local n = 0
		for key, value in pairs(t) do
			n = n + 1
			results[n] = func(value, key, t)
		end
		return results
	end

	for index, value in ipairs(t) do
		results[index] = func(value, index, t)
	end
	return results
end

-- Returns a new table holding the entries of `t` for which
-- `func(value, key, t)` is truthy. A table with an entry at index 1 is walked
-- as an array and the survivors are renumbered from 1; any other table is
-- walked with pairs and the survivors keep their original keys.
local function filter(t, func)
	local kept = {}

	if not t[1] then
		for key, value in pairs(t) do
			if func(value, key, t) then
				kept[key] = value
			end
		end
		return kept
	end

	local count = 0
	for index, value in ipairs(t) do
		if func(value, index, t) then
			count = count + 1
			kept[count] = value
		end
	end
	return kept
end

-- Comparator for table.sort: orders range arrays by their first element.
local function sortRange(range1, range2)
	local start1, start2 = range1[1], range2[1]
	return start1 < start2
end

--[[
	Binary search over an array of range arrays such as { 0x41, 0x7A, "Latn" },
	which must be ordered by their first element. The number of entries is read
	from the `length` field of `ranges`.

	Returns the range array whose first and second elements enclose `value`
	(the whole array, so that the caller can cache it), or nil if `ranges` is
	nil, empty, or contains no such range.
]]
local function binarySearch(ranges, value)
	if not ranges or ranges.length == 0 then
		return nil
	end

	local low, high = 1, ranges.length

	while low <= high do
		local middle = floor((low + high) / 2)
		local candidate = ranges[middle]

		if value < candidate[1] then
			-- Value lies before this range: continue in the lower half.
			high = middle - 1
		elseif value <= candidate[2] then
			return candidate
		else
			-- Value lies after this range: continue in the upper half.
			low = middle + 1
		end
	end

	return nil
end

--[[
-- For debugging
local function toHex(number)
	return ("0x%X"):format(number)
end

local function logRange(range, number)
	return mw.log(toHex(range[1]), toHex(number) .. " (" .. mw.ustring.char(number) .. ")", toHex(range[2]), range[3])
end
--]]

-- Linear scan over an array of range arrays ordered by their first element.
-- Returns the third element of the first range that encloses `number`; gives
-- up as soon as a range starts beyond `number`, and returns nothing when the
-- array is exhausted.
local function lookUpInOrder(number, ranges)
	for _, candidate in ipairs(ranges) do
		local first, last = candidate[1], candidate[2]
		if number < first then
			return nil
		end
		if number <= last then
			return candidate[3]
		end
	end
end

-- Ranges that have already matched a codepoint, kept sorted by their first
-- element so that later codepoints from the same range are found without
-- another binary search.
local rangesCache = {}

--[=[
	Maps a codepoint to a script code using the data module
	[[Module:Language/scripts/codepoints]], which uses the official Unicode
	script codes.

	The lookup order is: the codepoint-to-script map of individual codepoints,
	then the cache of previously matched ranges, then a binary search over the
	array of ranges. Returns "Zzzz" when nothing matches.

	Raises an error if the argument is not a number.
]=]
function p.codepointToScript(codepoint)
	local argType = type(codepoint)
	if argType ~= "number" then
		error("Argument to codepointToScript should be a number, but its type is " .. argType .. ".")
	end

	local lookup = codepoint_data

	local individualMatch = lookup.individual[codepoint]
	if individualMatch then
		return individualMatch
	end

	local cachedScript = lookUpInOrder(codepoint, rangesCache)
	if cachedScript then
		return cachedScript
	end

	local range = binarySearch(lookup.ranges, codepoint)
	if not range then
		return "Zzzz"
	end

	-- Remember the range and restore the ordering that lookUpInOrder relies on.
	rangesCache[#rangesCache + 1] = range
	table.sort(rangesCache, sortRange)
	return range[3]
end

-- Returns the script code for `char` by passing its codepoint, as obtained
-- from mw.ustring.codepoint, to p.codepointToScript.
function p.charToScript(char)
	local codepoint = mw.ustring.codepoint(char)
	return p.codepointToScript(codepoint)
end

-- Tallies the script of every codepoint in `text`.
-- Returns a table mapping script code to the number of codepoints that
-- p.codepointToScript assigned to it. Raises an error if `text` is not a
-- string.
function p.countScripts(text)
	if type(text) ~= "string" then
		error("countScripts requires a string")
	end

	local tally = {}
	local toScript = p.codepointToScript

	for codepoint in mw.ustring.gcodepoint(text) do
		local code = toScript(codepoint)
		if code then
			tally[code] = (tally[code] or 0) + 1
		end
	end

	return tally
end

-- Returns the script code of `text` when, after discarding the codes listed in
-- ignore_script, at most one script is left (nil if none is left). Returns
-- nothing when two or more scripts remain.
function p.getScript(text)
	local kept, keptCount = {}, 0

	for code in pairs(p.countScripts(text)) do
		if not ignore_script[code] then
			keptCount = keptCount + 1
			kept[keptCount] = code
		end
	end

	if keptCount < 2 then
		return kept[1]
	end
end

-- Builds one "* <argument>: <script> (<count>), ..." line per entry of
-- frame.args, using the counts from p.countScripts, and joins the lines with
-- newlines.
function p.showScripts(frame)
	local function describeCount(count, script)
		return script .. " (" .. count .. ")"
	end

	local function describeArgument(arg)
		local counts = map(describeCount, p.countScripts(arg))
		return "* " .. arg .. ": " .. table.concat(counts, ", ")
	end

	local lines = map(describeArgument, frame.args)
	return table.concat(lines, "\n")
end

return p