Module:Language/scripts: Difference between revisions
Content added Content deleted
Vivaporius (talk | contribs) m (1 revision imported) |
en>Centrist16 (Created page with "local p = {} local gsub = mw.ustring.gsub local length = mw.ustring.len local floor = math.floor local UTF8Char = "[%z\1-\127\194-\244][\128-\191]*" local codepoint_data = mw...") |
||
Line 1: | Line 1: | ||
-- Export table for this module.
local p = {}

-- Local aliases for library functions.
local gsub = mw.ustring.gsub
local length = mw.ustring.len
local floor = math.floor

-- Lua pattern for the bytes of one UTF-8 encoded character: a single byte in
-- the ASCII or lead-byte range followed by any number of continuation bytes.
local UTF8Char = "[%z\1-\127\194-\244][\128-\191]*"

-- Codepoint data, read through its `individual` and `ranges` fields by
-- p.codepointToScript. The title is spelled with the same capitalisation as
-- the sibling data module below so that the lookup does not depend on the
-- wiki capitalising the first letter of page titles.
local codepoint_data = mw.loadData("Module:Language/scripts/codepoints")
local data = require("Module:Language/scripts/data")
||
Line 41: | Line 44: | ||
local text = frame.args[1] |
local text = frame.args[1] |
||
return p.isLatn(text) |
return p.isLatn(text) |
||
end |
|||
-- Set of script codes that p.getScript discards before deciding whether a
-- text uses a single script. "Zzzz" is also the fallback code returned by
-- p.codepointToScript.
-- NOTE(review): "Zinh" and "Zyyy" are presumably the codes for inherited and
-- common characters -- confirm against the codepoint data module.
local ignore_script = require("Module:TableTools").listToSet({
	"Zinh",
	"Zyyy",
	"Zzzz",
})
|||
-- Builds a new array from the results of calling func on every entry of t.
--
-- func is called as func(value, key, t). When t[1] is truthy, t is walked
-- with ipairs and each result is stored at the same index as its source
-- value. Otherwise t is walked with pairs (order unspecified) and the results
-- are stored at consecutive indices in visiting order.
local function map(func, t)
	local results = {}

	if not t[1] then
		local count = 0
		for key, value in pairs(t) do
			count = count + 1
			results[count] = func(value, key, t)
		end
		return results
	end

	for index, value in ipairs(t) do
		results[index] = func(value, index, t)
	end
	return results
end
|||
-- Returns a new table holding the entries of t for which func returns a
-- truthy value.
--
-- func is called as func(value, key, t). When t[1] is truthy, t is walked
-- with ipairs and the surviving values are packed into consecutive indices.
-- Otherwise t is walked with pairs and the surviving values keep their
-- original keys.
local function filter(t, func)
	local kept = {}

	if not t[1] then
		for key, value in pairs(t) do
			if func(value, key, t) then
				kept[key] = value
			end
		end
		return kept
	end

	local count = 0
	for index, value in ipairs(t) do
		if func(value, index, t) then
			count = count + 1
			kept[count] = value
		end
	end
	return kept
end
|||
-- Comparator for table.sort: orders two range arrays by their first element
-- (the value binarySearch treats as the start of the range).
local function sortRange(range1, range2)
	local start1, start2 = range1[1], range2[1]
	return start1 < start2
end
|||
-- Binary search over a list of codepoint ranges; efficient for long lists.
--
-- ranges: array of range arrays such as { 0x41, 0x7A, "Latn" }, with the
--         number of entries stored in its `length` field. May be nil.
-- value:  the number to locate.
--
-- Returns the range array whose first two elements enclose value (so the
-- caller can place it in a cache), or nil when ranges is missing, empty, or
-- contains no such range.
-- NOTE(review): the halving assumes the ranges are ordered by their first
-- element and do not overlap -- confirm against the data module.
local function binarySearch(ranges, value)
	if not ranges then
		return nil
	end

	local low, high = 1, ranges.length
	if high == 0 then
		return nil
	end

	while low <= high do
		-- Probe the middle of the remaining window.
		local middle = floor((low + high) / 2)
		local candidate = ranges[middle]

		if value < candidate[1] then
			-- Value lies before this range: continue in the lower half.
			high = middle - 1
		elseif value <= candidate[2] then
			return candidate
		else
			-- Value lies after this range: continue in the upper half.
			low = middle + 1
		end
	end

	return nil
end
|||
--[[ |
|||
-- For debugging |
|||
local function toHex(number) |
|||
return ("0x%X"):format(number) |
|||
end |
|||
local function logRange(range, number) |
|||
return mw.log(toHex(range[1]), toHex(number) .. " (" .. mw.ustring.char(number) .. ")", toHex(range[2]), range[3]) |
|||
end |
|||
--]] |
|||
-- Linear scan over an array of range arrays of the form { first, last, value }.
--
-- Returns the third element of the first range that encloses number. Returns
-- nil as soon as a range starting above number is met, so the scan only
-- finds every match when ranges is sorted by first element. Returns nothing
-- when the list is exhausted.
local function lookUpInOrder(number, ranges)
	for _, entry in ipairs(ranges) do
		if number < entry[1] then
			return nil
		end
		if number <= entry[2] then
			return entry[3]
		end
	end
end
|||
-- Ranges already found by binarySearch, remembered in case a later character
-- falls in the same range. The list is re-sorted by first codepoint after
-- every insertion, which is what lets lookUpInOrder stop at the first range
-- that starts above the codepoint being looked up.
local rangesCache = {}

-- Takes a codepoint and returns the script code that is appropriate for it,
-- based on the codepoint data module loaded into codepoint_data.
--
-- Lookup order:
--   1. the codepoint-to-script map (`individual`),
--   2. the cache of ranges used earlier,
--   3. a binary search of the full array of ranges (`ranges`); a hit is added
--      to the cache.
-- Returns "Zzzz" when none of these yields a match.
--
-- Raises an error when codepoint is not a number.
function p.codepointToScript(codepoint)
	local argType = type(codepoint)
	if argType ~= "number" then
		error("Argument to codepointToScript should be a number, but its type is " .. argType .. ".")
	end

	local single = codepoint_data.individual[codepoint]
	if single then
		return single
	end

	local cached = lookUpInOrder(codepoint, rangesCache)
	if cached then
		return cached
	end

	local found = binarySearch(codepoint_data.ranges, codepoint)
	if not found then
		return "Zzzz"
	end

	-- Remember the range and restore the ordering lookUpInOrder relies on.
	table.insert(rangesCache, found)
	table.sort(rangesCache, sortRange)
	return found[3]
end
|||
-- Returns the script code for a character given as a string: the string is
-- converted with mw.ustring.codepoint (called without index arguments) and
-- the resulting codepoint is passed to p.codepointToScript.
function p.charToScript(char)
	local codepoint = mw.ustring.codepoint(char)
	return p.codepointToScript(codepoint)
end
|||
-- Counts how many codepoints of text belong to each script.
--
-- Returns a table mapping script code to the number of codepoints that
-- p.codepointToScript assigned to that code.
--
-- Raises an error when text is not a string.
function p.countScripts(text)
	if type(text) ~= "string" then
		error("countScripts requires a string")
	end

	local toScript = p.codepointToScript
	local tally = {}

	for codepoint in mw.ustring.gcodepoint(text) do
		local code = toScript(codepoint)
		if code then
			tally[code] = (tally[code] or 0) + 1
		end
	end

	return tally
end
|||
-- Returns the one script code used by text, disregarding the codes listed in
-- ignore_script.
--
-- Returns nil when every code found is ignored (or none is found), and
-- returns nothing when two or more non-ignored codes are present.
function p.getScript(text)
	local only

	for code in pairs(p.countScripts(text)) do
		if not ignore_script[code] then
			if only then
				-- A second relevant script: there is no single answer.
				return
			end
			only = code
		end
	end

	return only
end
|||
-- Produces one line per entry of frame.args, joined with newlines. Each line
-- has the form "* <arg>: <script> (<count>), <script> (<count>), ...", built
-- from the counts that p.countScripts reports for that argument. The order of
-- the scripts within a line follows the pairs traversal done by map and is
-- therefore unspecified.
function p.showScripts(frame)
	-- Renders a single script/count pair.
	local function describeCount(count, script)
		return script .. " (" .. count .. ")"
	end

	-- Renders the whole line for one argument.
	local function describeArg(arg)
		local parts = map(describeCount, p.countScripts(arg))
		return "* " .. arg .. ": " .. table.concat(parts, ", ")
	end

	local lines = map(describeArg, frame.args)
	return table.concat(lines, "\n")
end
||