Module:Vi2ipa 2

From Wikipedia

Documentation for this module may be created at Module:Vi2ipa 2/doc

---Converts a Vietnamese word or compound word to IPA, with support for the
-- pronunciations of several of Vietnam's main dialects.
--
-- Meant to supersede [[Wikipedia:wikt:vi:Bản mẫu:vie-pron]], which in turn
-- superseded [[Wikipedia:wikt:vi:Bản mẫu:VieIPA]].
local vi2ipa = {
	-- Per-dialect transcription data, loaded from the companion data module.
	dialects = require("Module:Vi2ipa 2/dialects")
}

-- Dialect names in display order; vi2ipa.tableColumnHeaders and
-- vi2ipa._tableRow emit one column per entry, in this order.
local dialects = {
	"Hà Nội",
	"Hải Phòng",
	"Vinh",
	"Thanh Chương",
	"Hà Tĩnh",
	"Huế",
	"Quy Nhơn",
	"Sài Gòn",
}

---Lua pattern that matches exactly one UTF-8 encoded character and returns it
-- as a capture: a lead byte (NUL, \1-\127 for ASCII, or \194-\244 for a
-- multi-byte sequence) followed by any number of continuation bytes
-- (\128-\191). Used with gmatch to walk a string character by character and
-- with gsub to replace characters through a lookup table.
-- http://lua-users.org/wiki/LuaUnicode
local unichar_pattern = "([%z\1-\127\194-\244][\128-\191]*)"

-- Uppercase Vietnamese letters that carry a diacritic (plain A-Z are not
-- listed). NOTE(review): not referenced anywhere else in the visible source.
local upper = "ĐÂĂÊÔƠƯÁẤẮÉẾÍÓỐỚÚỨÝÀẦẰÈỀÌÒỒỜÙỪỲẢẨẲẺỂỈỎỔỞỦỬỶÃẪẴẼỄĨÕỖỠŨỮỸẠẬẶẸỆỊỌỘỢỤỰỴ"
-- Lowercase counterparts of the letters above, in the same order. vi2ipa._ipa
-- splices this string into a character class to recognise word characters.
-- NOTE(review): Lua patterns work on bytes, so that class matches the
-- individual bytes of these letters rather than whole characters.
local lower = "đâăêôơưáấắéếíóốớúứýàầằèềìòồờùừỳảẩẳẻểỉỏổởủửỷãẫẵẽễĩõỗỡũữỹạậặẹệịọộợụựỵ"
-- Lookup table from uppercase Vietnamese letters to their lowercase forms.
local upperToLower = {
	["Đ"] = "đ",
	["A"] = "a", ["Á"] = "á", ["À"] = "à", ["Ã"] = "ã", ["Ả"] = "ả", ["Ạ"] = "ạ",
	["Â"] = "â", ["Ấ"] = "ấ", ["Ầ"] = "ầ", ["Ẫ"] = "ẫ", ["Ẩ"] = "ẩ", ["Ậ"] = "ậ",
	["Ă"] = "ă", ["Ắ"] = "ắ", ["Ằ"] = "ằ", ["Ẵ"] = "ẵ", ["Ẳ"] = "ẳ", ["Ặ"] = "ặ",
	["E"] = "e", ["É"] = "é", ["È"] = "è", ["Ẽ"] = "ẽ", ["Ẻ"] = "ẻ", ["Ẹ"] = "ẹ",
	["Ê"] = "ê", ["Ế"] = "ế", ["Ề"] = "ề", ["Ễ"] = "ễ", ["Ể"] = "ể", ["Ệ"] = "ệ",
	["I"] = "i", ["Í"] = "í", ["Ì"] = "ì", ["Ĩ"] = "ĩ", ["Ỉ"] = "ỉ", ["Ị"] = "ị",
	["O"] = "o", ["Ó"] = "ó", ["Ò"] = "ò", ["Õ"] = "õ", ["Ỏ"] = "ỏ", ["Ọ"] = "ọ",
	["Ô"] = "ô", ["Ố"] = "ố", ["Ồ"] = "ồ", ["Ỗ"] = "ỗ", ["Ổ"] = "ổ", ["Ộ"] = "ộ",
	["Ơ"] = "ơ", ["Ớ"] = "ớ", ["Ờ"] = "ờ", ["Ỡ"] = "ỡ", ["Ở"] = "ở", ["Ợ"] = "ợ",
	["U"] = "u", ["Ú"] = "ú", ["Ù"] = "ù", ["Ũ"] = "ũ", ["Ủ"] = "ủ", ["Ụ"] = "ụ",
	["Ư"] = "ư", ["Ứ"] = "ứ", ["Ừ"] = "ừ", ["Ữ"] = "ữ", ["Ử"] = "ử", ["Ự"] = "ự",
	["Y"] = "y", ["Ý"] = "ý", ["Ỳ"] = "ỳ", ["Ỹ"] = "ỹ", ["Ỷ"] = "ỷ", ["Ỵ"] = "ỵ"
}
-- Characters missing from the table fall back on the byte-wise string.lower;
-- the result is stored in the table so each character is converted only once.
setmetatable(upperToLower, {
	__index = function (cache, character)
		local lowered = character:lower()
		cache[character] = lowered
		return lowered
	end
})

---Returns a copy of the given string with all uppercase Vietnamese letters
-- changed to lowercase.
-- @param s The string to convert.
-- @return The converted string, as a single value.
-- @see string.lower
function vi2ipa.lower(s)
	-- gsub also returns the number of substitutions; the parentheses truncate
	-- the result to the string alone so the count cannot leak into argument
	-- lists or table constructors at the call site.
	return (s:gsub(unichar_pattern, upperToLower))
end

---Table mapping vowel characters to their toneless base letters.
local vowelsToBases = {}
do
	-- One row per base letter: the toneless letter first, then its five
	-- tone-marked forms. Every character in a row maps to the row's first entry.
	local families = {
		{"a", "á", "à", "ã", "ả", "ạ"},
		{"â", "ấ", "ầ", "ẫ", "ẩ", "ậ"},
		{"ă", "ắ", "ằ", "ẵ", "ẳ", "ặ"},
		{"e", "é", "è", "ẽ", "ẻ", "ẹ"},
		{"ê", "ế", "ề", "ễ", "ể", "ệ"},
		{"i", "í", "ì", "ĩ", "ỉ", "ị"},
		{"o", "ó", "ò", "õ", "ỏ", "ọ"},
		{"ô", "ố", "ồ", "ỗ", "ổ", "ộ"},
		{"ơ", "ớ", "ờ", "ỡ", "ở", "ợ"},
		{"u", "ú", "ù", "ũ", "ủ", "ụ"},
		{"ư", "ứ", "ừ", "ữ", "ử", "ự"},
		{"y", "ý", "ỳ", "ỹ", "ỷ", "ỵ"},
	}
	for _, family in ipairs(families) do
		local base = family[1]
		for _, vowel in ipairs(family) do
			vowelsToBases[vowel] = base
		end
	end
end

---Table mapping vowel characters to the VIQR representation of their tones.
local vowelsToVIQRTones = {}
do
	-- VIQR tone marks, one per column of the rows below; the first column is
	-- the unmarked letter, whose tone is the empty string.
	local marks = {"", "'", "`", "~", "?", "."}
	local families = {
		{"a", "á", "à", "ã", "ả", "ạ"},
		{"â", "ấ", "ầ", "ẫ", "ẩ", "ậ"},
		{"ă", "ắ", "ằ", "ẵ", "ẳ", "ặ"},
		{"e", "é", "è", "ẽ", "ẻ", "ẹ"},
		{"ê", "ế", "ề", "ễ", "ể", "ệ"},
		{"i", "í", "ì", "ĩ", "ỉ", "ị"},
		{"o", "ó", "ò", "õ", "ỏ", "ọ"},
		{"ô", "ố", "ồ", "ỗ", "ổ", "ộ"},
		{"ơ", "ớ", "ờ", "ỡ", "ở", "ợ"},
		{"u", "ú", "ù", "ũ", "ủ", "ụ"},
		{"ư", "ứ", "ừ", "ữ", "ử", "ự"},
		{"y", "ý", "ỳ", "ỹ", "ỷ", "ỵ"},
	}
	for _, family in ipairs(families) do
		for column, vowel in ipairs(family) do
			vowelsToVIQRTones[vowel] = marks[column]
		end
	end
end

---Receives a word and returns a copy of the word without tone marks.
-- Letters keep their quality diacritics (e.g. "ế" becomes "ê"); characters
-- that are not vowels are left unchanged.
-- @param word The word to strip of tone marks.
-- @return The toneless word, as a single value.
function vi2ipa.detone(word)
	-- gsub also returns the number of substitutions; the parentheses truncate
	-- the result to the string alone.
	return (word:gsub(unichar_pattern, vowelsToBases))
end

---Finds the tone of a glide-vowel-glide sequence, in VIQR notation.
-- @param gvg The sequence (or whole word) to inspect.
-- @return The VIQR mark of the first tone-marked vowel, or "" (ngang) when no
-- vowel carries a tone mark.
function vi2ipa.viqrTone(gvg)
	for char in gvg:gmatch(unichar_pattern) do
		local mark = vowelsToVIQRTones[char]
		-- Unmarked vowels map to "", so keep scanning past them.
		if mark and mark ~= "" then
			return mark
		end
	end
	return ""
end

---Returns a breakdown of the given word.
-- The word is split, left to right, into an initial consonant cluster, an
-- interior glide-vowel-glide sequence and a final consonant cluster. Only
-- lowercase letters are recognised.
-- @param word A single lowercase syllable.
-- @return A table with the fields ci (initial consonants), gvg
-- (glide-vowel-glide sequence), cf (final consonants) and t (VIQR tone).
-- Raises an error if the three parts do not account for the whole word.
function vi2ipa.components(word)
	-- Initial consonant cluster
	-- [[Wikipedia:wikt:vi:Bản mẫu:vie-pron/VieC]]
	local ci = {}
	local initialConsonants = "bcdfghjklmnpqrstvwxz"
	for letter in word:gmatch(unichar_pattern) do
		-- Plain find: the letter is data, not a pattern. Without it "." or "^"
		-- would count as consonants and "%" or "(" would raise a pattern error.
		if (#letter == 1 and initialConsonants:find(letter, 1, true)) or letter == "đ" then
			table.insert(ci, letter)
		else break end
	end
	ci = table.concat(ci)
	
	-- Interior glide-vowel-glide sequence
	-- [[Wikipedia:wikt:vi:Bản mẫu:vie-pron/VieV]]
	local gvg = {}
	for letter in word:sub(1 + #ci):gmatch(unichar_pattern) do
		if vowelsToBases[letter] then
			table.insert(gvg, letter)
		else break end
	end
	gvg = table.concat(gvg)
	
	-- Final consonant cluster
	local cf = {}
	local finalConsonants = "cghmnpt"
	for letter in word:sub(1 + #ci + #gvg):gmatch(unichar_pattern) do
		if #letter == 1 and finalConsonants:find(letter, 1, true) then
			table.insert(cf, letter)
		else break end
	end
	cf = table.concat(cf)
	
	-- TODO: Support polysyllabic words.
	-- The message is only built on failure and shows the offending word
	-- itself rather than its byte length.
	if #ci + #gvg + #cf ~= #word then
		error("Polysyllabic loan words not yet implemented: " ..
			ci .. "+" .. gvg .. "+" .. cf .. "≠" .. word, 0)
	end
	
	-- Semisyllables, like in “H'Mông”
	if #ci > 0 and #gvg < 1 and #cf < 1 then gvg = "ờ" end
	
	-- Tone
	local t = vi2ipa.viqrTone(gvg)
	
	return {ci = ci, gvg = gvg, cf = cf, t = t}
end

---Returns the IPA transcription of the given initial consonant cluster.
-- @param ci Initial consonant cluster.
-- @param gvg Glide-vowel-glide sequence, passed to function-valued data entries.
-- @param cf Final consonant cluster, passed to function-valued data entries.
-- @param dialect Key into vi2ipa.dialects.
-- @return The IPA string, or "" when the dialect data has no matching entry.
function vi2ipa.ciToIPA(ci, gvg, cf, dialect)
	local data = vi2ipa.dialects[dialect].initialConsonantsToIPA
	-- First character of the cluster. Taken as a whole UTF-8 character so the
	-- two-byte "đ" is not cut in half; ci:sub(1) would just be the whole
	-- cluster again and make the fallback below a no-op.
	local first = ci:match("^" .. unichar_pattern) or ""
	local ipa = data[ci] or
		-- Loanwords from some minority languages retain double consonants:
		-- fall back on the first letter, then on the rest of the cluster.
		data[first] or data[ci:sub(#first + 1)]
	if type(ipa) == "function" then ipa = ipa(gvg, cf) end
	return ipa or ""
end

---Returns the IPA transcription of the given glide-vowel-glide sequence.
-- The dialect data may contain "_" markers in its transcriptions; tone-dependent
-- glottal stops and phonation diacritics are attached at those markers, and
-- the markers are stripped from the result.
-- @param ci Initial consonant cluster, passed to function-valued data entries.
-- @param gvg Glide-vowel-glide sequence; looked up as given, then without tone marks.
-- @param cf Final consonant cluster, passed to function-valued data entries.
-- @param t VIQR tone, used to look up the dialect's tone attributes.
-- @param dialect Key into vi2ipa.dialects.
-- @return The IPA string (a single value), or "" when the dialect data has no
-- matching entry.
function vi2ipa.gvgToIPA(ci, gvg, cf, t, dialect)
	local gvgData = vi2ipa.dialects[dialect].interiorToIPA
	local toneAttributes = vi2ipa.dialects[dialect].toneAttributes[t] or {}
	local ipa = gvgData[gvg] or gvgData[vi2ipa.detone(gvg)] or ""
	if type(ipa) == "function" then ipa = ipa(ci, cf) or "" end
	
	-- Insert glottal stop.
	-- Lua patterns are byte-based, so a "?" placed after the two-byte length
	-- mark "ː" would make only its last byte optional. The optional length
	-- mark is therefore handled with a separate, literal substitution.
	if toneAttributes.glottal then
		if toneAttributes.repeated then
			-- Drop the length mark, then echo the vowel after the glottal stop.
			ipa = ipa:gsub(unichar_pattern .. "_ː", "%1_")
			ipa = ipa:gsub(unichar_pattern .. "_", "%1_ʔ%1_")
		else
			-- Add the glottal stop after the marker, then move it behind the
			-- length mark where there is one.
			ipa = ipa:gsub(unichar_pattern .. "_", "%1_ʔ")
			ipa = ipa:gsub("_ʔː", "_ːʔ")
		end
	end
	
	-- Insert breathy-voice diacritic.
	if toneAttributes.breathy then ipa = ipa:gsub("_", "\204\164_") -- U+0324
	-- Or insert creaky-voice diacritic.
	elseif toneAttributes.creaky then ipa = ipa:gsub("_", "\204\176_") -- U+0330
	end
	
	-- Strip the markers; the parentheses drop gsub's substitution count.
	return (ipa:gsub("_", ""))
end

---Transcribes a final consonant cluster into IPA.
-- @param ci Initial consonant cluster, passed to function-valued data entries.
-- @param gvg Glide-vowel-glide sequence, passed to function-valued data entries.
-- @param cf Final consonant cluster to transcribe.
-- @param dialect Key into vi2ipa.dialects.
-- @return The IPA string, or "" when the dialect data has no matching entry.
function vi2ipa.cfToIPA(ci, gvg, cf, dialect)
	local entry = vi2ipa.dialects[dialect].finalConsonantsToIPA[cf]
	if type(entry) ~= "function" then
		return entry or ""
	end
	-- Context-dependent finals are stored as functions of the onset and nucleus.
	return entry(ci, gvg) or ""
end

---Looks up the IPA tone letters for the tone carried by the given word.
-- [[Wikipedia:wikt:vi:Bản mẫu:vie-pron/VieTn]] and [[Wikipedia:wikt:vi:Bản mẫu:vie-pron/VieT]]
-- @param word The word (or vowel sequence) whose tone is wanted.
-- @param dialect Key into vi2ipa.dialects.
-- @return The dialect's viqrTonesToIPA entry for the word's VIQR tone; nothing
-- if no tone could be determined.
function vi2ipa.viqrToneToIPA(word, dialect)
	local viqr = vi2ipa.viqrTone(word)
	if viqr then
		return vi2ipa.dialects[dialect].viqrTonesToIPA[viqr]
	end
end

---Template entry point: transcribes the given Vietnamese text into IPA.
-- The text is read from the "text" argument or the first positional argument
-- (default ""); the dialect from the "dialect" argument or the second
-- positional argument (default "Hà Nội").
-- @usage {{#invoke:vi2ipa|ipa|tiếng Việt}}
function vi2ipa.ipa(frame)
	local args = frame.args
	local text = args.text or args[1] or ""
	local dialect = args.dialect or args[2] or "Hà Nội"
	return vi2ipa._ipa(text, dialect)
end
---Transcribes every word of the given text into IPA for one dialect.
-- The text is lowercased and split into runs of letters; each run is
-- transcribed as one syllable, followed by its tone letters in a grey span.
-- @param text The Vietnamese text to transcribe.
-- @param dialect Key into vi2ipa.dialects.
-- @return The transcriptions of all words, separated by spaces.
function vi2ipa._ipa(text, dialect)
	local ipa = {}
	for word in vi2ipa.lower(text):gmatch("([a-z" .. lower .. "]+)") do
		local c = vi2ipa.components(word)
		-- Take the tone from c.gvg rather than the raw word so the tone letters
		-- agree with the tone used for the vowel: for semisyllables, components
		-- substitutes a tone-marked vowel that the raw word does not contain.
		-- The lookup can yield nil, which must not reach the concatenation.
		local toneLetters = vi2ipa.viqrToneToIPA(c.gvg, dialect) or ""
		table.insert(ipa,
			vi2ipa.ciToIPA(c.ci, c.gvg, c.cf, dialect) ..
			vi2ipa.gvgToIPA(c.ci, c.gvg, c.cf, c.t, dialect) ..
			vi2ipa.cfToIPA(c.ci, c.gvg, c.cf, dialect) ..
			"<span style='color: #808080;'>" ..
				toneLetters ..
			"</span>")
	end
	return table.concat(ipa, " ")
end

---Builds the header row of the comparison table: a leading "Từ" header cell
-- followed by one header cell per supported dialect.
-- @usage {{#invoke:vi2ipa|tableColumnHeaders}}
function vi2ipa.tableColumnHeaders()
	local row = {"<tr>", "<th>Từ</th>"}
	for _, name in ipairs(dialects) do
		row[#row + 1] = "<th>" .. name .. "</th>"
	end
	row[#row + 1] = "</tr>"
	return table.concat(row)
end

---Returns an HTML table row of IPA transcriptions of the given word in all the
-- supported dialects. Adjacent, identical table cells are combined.
-- The word is read from the "word" argument or the first positional argument.
-- @usage {{#invoke:vi2ipa|tableRow|tiếng Việt}}
function vi2ipa.tableRow(frame)
	-- Fall back on the empty string, as vi2ipa.ipa does, so that a missing
	-- argument yields an empty row instead of an error from indexing nil.
	return vi2ipa._tableRow(frame.args.word or frame.args[1] or "")
end
---Builds one table row for the given word: a row header holding the word,
-- then the word's transcription in each supported dialect. Runs of adjacent
-- dialects with identical transcriptions share a single cell with a colspan.
function vi2ipa._tableRow(word)
	-- Collapse adjacent identical transcriptions while collecting them.
	local runs = {}
	for _, dialect in ipairs(dialects) do
		local transcription = vi2ipa._ipa(word, dialect)
		local last = runs[#runs]
		if last and last.text == transcription then
			last.span = last.span + 1
		else
			runs[#runs + 1] = {text = transcription, span = 1}
		end
	end
	
	local html = {"<tr><th scope='row'>" .. word .. "</th>"}
	for _, run in ipairs(runs) do
		local attribute = ""
		if run.span > 1 then
			attribute = " colspan='" .. run.span .. "'"
		end
		html[#html + 1] = "<td" .. attribute .. ">" .. run.text .. "</td>"
	end
	html[#html + 1] = "</tr>"
	return table.concat(html)
end

return vi2ipa