模組:Hani-sortkey

下列說明文檔位於Module:Hani-sortkey/doc。^[編輯]

This module sorts text written in the Han script (漢字). It is used to sort 錦語, 南奄美大島語, 中部白語, 北部白語, 南部白語, 標敏語, 碧約語, 閩東語, 晉語, 官話, 北部平話, 洋涇浜英語, 莆仙語, 澳門皮欽葡萄牙語, 南部平話, 徽語, 閩中語, 東干語, 達斡爾語, 誒話, 贛語, 客家語, 湘語, 日語, 八丈語, 喜界語, 拉瑪白語, 中古漢語, 文言文, 中古越南語, 高欄語, 閩北語, 跨語言, 宮古語, 閩南語, 泉漳話, 海南話, 雷州話, 潮州話, 儂語, 上古漢語, 古典日語, 沖永良部語, 古回鶻語, 布依語, 百濟語, Rouran, 吐谷渾語, 拓跋語, 烏桓語, 鮮卑語, 北奄美大島語, 八重山語, 沖繩語, 水語, 白狼語, 德之島語, 阿勒楚喀語, 巴拉語, 恰喀拉語, 岱依語, 羯語, 越南語, 吳語, 瓦鄉話, 古典藏語, 中古蒙古語, 扶餘語, 國頭語, 與那國語, 與論語, 粵語, 壯語, 柔若語, 漢語, 韶州土話, 四川話, 台山話, 高句麗語, 扎話, and 契丹語. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{sortkey}}. Within a module, use Module:languages#Language:makeSortKey.

For testcases, see Module:Hani-sortkey/testcases.

Functions

makeSortKey(text, lang, sc): Generates a sortkey for a given piece of text written in the script specified by the code sc, and language specified by the code lang. When the sort fails, returns nil.

產生如下所示內容的示範函數位於Module:Hani-sortkey/templates中。模組的修改可以在Module:Hani-sortkey/sandbox中進行測試。單一字元的排序鍵是從178個資料模組之一檢索的。Module:Hani-sortkey/data為這些模組建立文件。

顯示排序鍵

PS/2接口 (PS/2工44酉39)
gas爐／gas炉 (gas山39)
γ粒子 (γ己39工40)
濕𣲷𣲷／湿𣲷𣲷 (工38幺-1幺-1)
得個……字 (己43見38……丿38)
命裡有時終須有，命裡無時莫強求 (广157水38口39工47幺54皿38口39，广157水38廴36工47弓65己40工42)
赛车 (水39廾76)

鿪 (己39)

阿坝 (工40己46)
𡎇 (工43)
九 (己44)
𡆔 (弓135)

表意文字描述序列

⿰亻革 (⿰虫38疒39)
⿰亻革家語／⿰亻革家语 (⿰虫38疒39工39弋75)
⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心麵／⿺辶⿳穴⿲月⿱⿲幺言幺⿲长马长刂心⿺辶⿳穴⿲月⿱⿲幺言幺⿲长马长刂心面 (⿺火38⿳手38⿲己39⿱⿲工47工45工47⿲瓦39矛38瓦39工41巾00⿺火38⿳手38⿲己39⿱⿲工47工45工47⿲瓦39矛38瓦39工41巾00工43)

-- Table of functions exported by this module (returned at the end of the file).
local export = {}

local m_str_utils = require("Module:string utilities")

-- Local aliases for the library functions used below.
local codepoint = m_str_utils.codepoint
local concat = table.concat
local explode_utf8 = m_str_utils.explode_utf8
local insert = table.insert
local u = m_str_utils.char

-- Serialized sortkey data; export.getData reads it two bytes at a time.
local m_data = require("Module:Hani-sortkey/data/serialized")
-- Core tables read by this module: ids, radicals, ranges, preconvert and unsupported.
local m_data_core = mw.loadData("Module:Hani-sortkey/data/core")
-- Per-character results of export.getData, filled in by export.makeSortKey.
local cache = {}

--[[
	Finds where the ideographic description sequence (IDS) that starts at
	index `i` ends.
	
	Parameters:
		text   - array of characters (one character per element).
		IDchar - the ideographic description character (IDC) found at text[i].
		i      - index of that IDC in `text`.
	
	Returns the index of the last character belonging to the IDS, or nil if
	an argument is missing, if IDchar is not listed in m_data_core.ids, or if
	the text ends before every expected component has been found. Recurses
	whenever a component is itself introduced by another IDC.
]]
local function findEndOfIDS(text, IDchar, i)
	if not (text and IDchar and i) then
		return nil
	end
	
	-- Number of components expected after the current IDC.
	local components = m_data_core.ids[IDchar]
	if not components then
		return nil
	end
	
	local j = i
	for _ = 1, components do
		j = j + 1
		
		local char = text[j]
		
		if not char then
			-- The text ended before the sequence was complete.
			return nil
		elseif m_data_core.ids[char] then
			-- This component is a nested IDS: jump to its last character.
			j = findEndOfIDS(text, char, j)
			if not j then
				--[[
					The nested sequence is incomplete, so this one is too.
					Returning here also keeps the next iteration from doing
					arithmetic on a nil index.
				]]
				return nil
			end
		end
	end
	
	-- Every expected component was found; j is the index of the last one.
	return j
end

--[[
	Expands one two-character serialized entry into its sortkey form: the
	entry of m_data_core.radicals indexed by the byte value of `a`, followed
	by the byte value of `b` minus 10, formatted as a number padded to at
	least two digits.
]]
local function unserialize(a, b)
	local radical = m_data_core.radicals[a:byte()]
	local count = b:byte() - 10
	return radical .. ("%02d"):format(count)
end

-- The data is stored in [[Module:Hani-sortkey/data]]. This data is not accessed directly (due to the large amount of memory this would consume), but is instead stored in a serialized form as [[Module:Hani-sortkey/data/serialized]]. If the data is changed, the new serialized data can be generated with [[Module:Hani-sortkey/data/serializer]].
--[[
	Returns the sortkey data for a single character.
	
	Parameters:
		char - a string (converted with codepoint()) or a codepoint number.
	
	Returns exactly one value: the unserialized two-byte entry for the
	codepoint if it falls inside one of the ranges in m_data_core.ranges,
	otherwise u(char).
	
	Raises an error if `char` is neither a string nor a number.
]]
function export.getData(char)
	if type(char) == "string" then
		char = codepoint(char)
	elseif type(char) ~= "number" then
		error("getData must operate on a single character or codepoint.")
	end
	
	-- ranges holds (start, finish) pairs in consecutive slots; n is the slot count.
	local ranges = m_data_core.ranges
	-- Number of codepoints covered by the ranges already skipped.
	local offset = 0
	for i = 2, ranges.n, 2 do
		local s, f = ranges[i - 1], ranges[i]
		if char > f then
			offset = offset + f - s + 1
		elseif char >= s then
			-- Each codepoint occupies two bytes in the serialized string.
			local lookup = 2 * (offset + char - s + 1)
			--[[
				The outer parentheses discard gsub's second result (the
				substitution count), so callers always get a single value.
			]]
			return (m_data:sub(lookup - 1, lookup):gsub("(.)(.)", unserialize))
		end
	end
	return u(char)
end

-- Script codes whose text is given a sortkey by export.makeSortKey.
local sortable_scripts = {
	Hani = true,
	Hans = true,
	Hant = true,
	Jpan = true,
	Kore = true
}

--[[
	Generates a sortkey for `text`.
	
	Parameters:
		text - the string to generate a sortkey for.
		lang - not used by this function.
		sc   - optional script code. If it is given and is not one of the
		       codes in sortable_scripts, text:uupper() is returned instead.
	
	Returns the concatenation of the per-character sortkeys as a string.
]]
function export.makeSortKey(text, lang, sc)
	if sc and not sortable_scripts[sc] then
		return text:uupper()
	end
	
	local sort = {}
	text = explode_utf8(text)
	local text_len = #text
	local i, char = 0
	while i < text_len do
		i = i + 1
		char = text[i]
		
		--[[
			Replace the character with its preconverted form, which may be
			several characters long: the first one overwrites text[i] and
			the rest are inserted after it, so they are processed in turn.
		]]
		local replacement = m_data_core.preconvert[char]
		if replacement then
			local j = 0
			for c in replacement:gmatch(".[\128-\191]*") do
				if j == 0 then
					text[i] = c
				else
					insert(text, i + j, c)
				end
				j = j + 1
			end
			char = text[i]
			text_len = #text
		end
		--[=[
			If we encounter an ideographic description character (IDC),
			find out if it begins a valid ideographic description sequence (IDS).
			
			If the IDS is valid and a sortkey for it is listed in
			m_data_core.unsupported, then insert that sortkey, and move to
			the next character after the IDS.
			
			Otherwise, insert the IDC into the sortkey and move to the next
			character after the IDC.
			
			If the IDS is valid and no sortkey for it is found, track it.
		]=]
		if m_data_core.ids[char] then
			local j = findEndOfIDS(text, char, i)
			local IDS, data
			if j then
				IDS = concat(text, nil, i, j)
				data = m_data_core.unsupported[IDS]
			end
			
			if not data then
				if IDS then
					require("Module:debug").track("Hani-sortkey/IDS-without-sortkey")
					mw.log("ideographic description sequence without sortkey: '"
						.. IDS .. "'")
				else
					require("Module:debug").track("Hani-sortkey/invalid-IDS")
					-- Log the remaining text, not just the IDC itself.
					mw.log("invalid ideographic description sequence at the beginning of '"
						.. concat(text, nil, i) .. "'")
				end
			end
			if IDS and data then
				insert(sort, data)
				-- Skip past the whole sequence.
				i = j
			else
				insert(sort, char)
			end
		else
			-- Only the first result of getData is kept in the cache.
			if not cache[char] then
				cache[char] = export.getData(char)
			end
			insert(sort, cache[char])
		end
	end
	
	return concat(sort)
end

return export