-- Module:Vi2ipa 2
-- Appearance
-- Documentation for this module may be created at Module:Vi2ipa 2/doc
---Transcribes a Vietnamese word or compound word into IPA. Supports
-- pronunciations in some of the main dialects of Vietnam.
--
-- This module is intended as a replacement for the lumbering monstrosity
-- [[Wikipedia:wikt:vi:Bản mẫu:vie-pron]], which itself is a replacement for the
-- editor-unfriendly [[Wikipedia:wikt:vi:Bản mẫu:VieIPA]].
-- Module table; every public function below is attached to it.
local vi2ipa = {
    -- Per-dialect transcription data, loaded from the companion module.
    dialects = require("Module:Vi2ipa 2/dialects"),
}

-- Dialect names in the order used for the columns produced by
-- tableColumnHeaders and _tableRow.
local dialects = {
    "Hà Nội",
    "Hải Phòng",
    "Vinh",
    "Thanh Chương",
    "Hà Tĩnh",
    "Huế",
    "Quy Nhơn",
    "Sài Gòn",
}
---Pattern that captures one UTF-8 encoded character per match.
-- http://lua-users.org/wiki/LuaUnicode
-- The first class accepts a lead byte (NUL via %z, ASCII \1-\127, or a
-- multi-byte lead \194-\244); the second accepts any number of continuation
-- bytes (\128-\191). Lua's string library works on bytes, so this is how the
-- module walks a string character by character.
local unichar_pattern = "([%z\1-\127\194-\244][\128-\191]*)"
-- Non-ASCII uppercase Vietnamese letters. Not referenced elsewhere in the
-- visible part of this module.
local upper = "ĐÂĂÊÔƠƯÁẤẮÉẾÍÓỐỚÚỨÝÀẦẰÈỀÌÒỒỜÙỪỲẢẨẲẺỂỈỎỔỞỦỬỶÃẪẴẼỄĨÕỖỠŨỮỸẠẬẶẸỆỊỌỘỢỤỰỴ"
-- Non-ASCII lowercase Vietnamese letters. vi2ipa._ipa splices this string
-- into a character class to pick words out of running text.
local lower = "đâăêôơưáấắéếíóốớúứýàầằèềìòồờùừỳảẩẳẻểỉỏổởủửỷãẫẵẽễĩõỗỡũữỹạậặẹệịọộợụựỵ"
---Lookup table from uppercase Vietnamese letters to their lowercase forms.
-- Characters without an explicit entry are lowered with string.lower on first
-- access and the result is cached in the table itself.
local upperToLower = setmetatable({
    ["Đ"] = "đ",
    ["A"] = "a", ["Á"] = "á", ["À"] = "à", ["Ã"] = "ã", ["Ả"] = "ả", ["Ạ"] = "ạ",
    ["Â"] = "â", ["Ấ"] = "ấ", ["Ầ"] = "ầ", ["Ẫ"] = "ẫ", ["Ẩ"] = "ẩ", ["Ậ"] = "ậ",
    ["Ă"] = "ă", ["Ắ"] = "ắ", ["Ằ"] = "ằ", ["Ẵ"] = "ẵ", ["Ẳ"] = "ẳ", ["Ặ"] = "ặ",
    ["E"] = "e", ["É"] = "é", ["È"] = "è", ["Ẽ"] = "ẽ", ["Ẻ"] = "ẻ", ["Ẹ"] = "ẹ",
    ["Ê"] = "ê", ["Ế"] = "ế", ["Ề"] = "ề", ["Ễ"] = "ễ", ["Ể"] = "ể", ["Ệ"] = "ệ",
    ["I"] = "i", ["Í"] = "í", ["Ì"] = "ì", ["Ĩ"] = "ĩ", ["Ỉ"] = "ỉ", ["Ị"] = "ị",
    ["O"] = "o", ["Ó"] = "ó", ["Ò"] = "ò", ["Õ"] = "õ", ["Ỏ"] = "ỏ", ["Ọ"] = "ọ",
    ["Ô"] = "ô", ["Ố"] = "ố", ["Ồ"] = "ồ", ["Ỗ"] = "ỗ", ["Ổ"] = "ổ", ["Ộ"] = "ộ",
    ["Ơ"] = "ơ", ["Ớ"] = "ớ", ["Ờ"] = "ờ", ["Ỡ"] = "ỡ", ["Ở"] = "ở", ["Ợ"] = "ợ",
    ["U"] = "u", ["Ú"] = "ú", ["Ù"] = "ù", ["Ũ"] = "ũ", ["Ủ"] = "ủ", ["Ụ"] = "ụ",
    ["Ư"] = "ư", ["Ứ"] = "ứ", ["Ừ"] = "ừ", ["Ữ"] = "ữ", ["Ử"] = "ử", ["Ự"] = "ự",
    ["Y"] = "y", ["Ý"] = "ý", ["Ỳ"] = "ỳ", ["Ỹ"] = "ỹ", ["Ỷ"] = "ỷ", ["Ỵ"] = "ỵ",
}, {
    -- Fallback for characters that are not listed above.
    __index = function (cache, character)
        local lowered = character:lower()
        rawset(cache, character, lowered)
        return lowered
    end,
})
---Returns a copy of the given string with all uppercase Vietnamese letters
-- changed to lowercase.
-- @param s UTF-8 encoded string.
-- @return The lowercased copy, as a single value like string.lower.
-- @see string.lower
function vi2ipa.lower(s)
    -- The parentheses discard gsub's second result (the substitution count),
    -- which would otherwise leak into callers such as table.insert(t, f(s)).
    return (s:gsub(unichar_pattern, upperToLower))
end
---Table mapping vowel characters to their toneless base letters.
-- Each entry below pairs a base letter with the string of all its toned
-- variants (the base itself included); the loop expands that into one
-- character-to-base entry per variant.
local vowelsToBases = {}
for base, variants in pairs({
    ["a"] = "aáàãảạ",
    ["â"] = "âấầẫẩậ",
    ["ă"] = "ăắằẵẳặ",
    ["e"] = "eéèẽẻẹ",
    ["ê"] = "êếềễểệ",
    ["i"] = "iíìĩỉị",
    ["o"] = "oóòõỏọ",
    ["ô"] = "ôốồỗổộ",
    ["ơ"] = "ơớờỡởợ",
    ["u"] = "uúùũủụ",
    ["ư"] = "ưứừữửự",
    ["y"] = "yýỳỹỷỵ",
}) do
    for vowel in variants:gmatch(unichar_pattern) do
        vowelsToBases[vowel] = base
    end
end
---Table mapping vowel characters to the VIQR representation of their tones.
-- Every vowel family lists its six forms in the same order, so the n-th
-- character of a family receives the n-th VIQR mark.
local vowelsToVIQRTones = {}
do
    -- Marks in the order the forms appear in each family string:
    -- unmarked, acute, grave, tilde, hook above, dot below.
    local viqrMarks = {"", "'", "`", "~", "?", "."}
    local families = {
        "aáàãảạ", "âấầẫẩậ", "ăắằẵẳặ",
        "eéèẽẻẹ", "êếềễểệ",
        "iíìĩỉị",
        "oóòõỏọ", "ôốồỗổộ", "ơớờỡởợ",
        "uúùũủụ", "ưứừữửự",
        "yýỳỹỷỵ",
    }
    for _, family in ipairs(families) do
        local position = 0
        for vowel in family:gmatch(unichar_pattern) do
            position = position + 1
            vowelsToVIQRTones[vowel] = viqrMarks[position]
        end
    end
end
---Receives a word and returns a copy of the word without tone marks.
-- Vowel quality marks (â, ă, ê, ô, ơ, ư) are kept; only the tone is removed.
-- Characters that are not vowels pass through unchanged.
-- @param word UTF-8 encoded, lowercase word.
-- @return The toneless copy, as a single value.
function vi2ipa.detone(word)
    -- The parentheses discard gsub's second result (the substitution count).
    return (word:gsub(unichar_pattern, vowelsToBases))
end
---Returns the VIQR representation of the given glide-vowel-glide sequence’s tone.
-- Scans the characters left to right and reports the mark of the first vowel
-- that carries one; an empty string stands for the unmarked (ngang) tone.
-- @param gvg UTF-8 encoded string to inspect.
-- @return One of "'", "`", "~", "?", "." or "".
function vi2ipa.viqrTone(gvg)
    for letter in gvg:gmatch(unichar_pattern) do
        local mark = vowelsToVIQRTones[letter]
        -- Non-vowels map to nil and unmarked vowels to ""; skip both.
        if mark and mark ~= "" then
            return mark
        end
    end
    -- No marked vowel found: ngang tone.
    return ""
end
---Collects the leading run of characters in `s` for which `accept` is true.
-- @param s UTF-8 encoded string.
-- @param accept Function taking one character and returning a truthy value
-- while the run should continue.
-- @return The accepted prefix of `s` (possibly empty).
local function leadingRun(s, accept)
    local run = {}
    for letter in s:gmatch(unichar_pattern) do
        if not accept(letter) then break end
        run[#run + 1] = letter
    end
    return table.concat(run)
end

---Returns a breakdown of the given word.
-- The word is split into an initial consonant cluster, an interior
-- glide-vowel-glide sequence and a final consonant cluster, in that order.
-- @param word Lowercase, UTF-8 encoded, single-syllable word.
-- @return Table with fields `ci` (initial consonants), `gvg` (interior
-- vowels), `cf` (final consonants) and `t` (VIQR tone mark of `gvg`).
-- Raises an error when the three parts do not account for the whole word.
function vi2ipa.components(word)
    -- Initial consonant cluster
    -- [[Wikipedia:wikt:vi:Bản mẫu:vie-pron/VieC]]
    local initialConsonants = "bcdfghjklmnpqrstvwxz"
    local ci = leadingRun(word, function (letter)
        -- Plain find: the letter is data, not a pattern, so "." or "%" must
        -- neither match everything nor raise a malformed-pattern error.
        return (#letter == 1 and initialConsonants:find(letter, 1, true))
            or letter == "đ"
    end)

    -- Interior glide-vowel-glide sequence
    -- [[Wikipedia:wikt:vi:Bản mẫu:vie-pron/VieV]]
    local gvg = leadingRun(word:sub(1 + #ci), function (letter)
        return vowelsToBases[letter]
    end)

    -- Final consonant cluster
    local finalConsonants = "cghmnpt"
    local cf = leadingRun(word:sub(1 + #ci + #gvg), function (letter)
        return #letter == 1 and finalConsonants:find(letter, 1, true)
    end)

    -- TODO: Support polysyllabic words.
    -- The message shows the word itself next to the parts that were parsed.
    assert(#ci + #gvg + #cf == #word, "Polysyllabic loan words not yet implemented: " ..
        ci .. "+" .. gvg .. "+" .. cf .. "≠" .. word)

    -- Semisyllables, like in “H'Mông”: a bare consonant cluster is given "ờ"
    -- as its vowel.
    if #ci > 0 and #gvg < 1 and #cf < 1 then gvg = "ờ" end

    -- Tone
    local t = vi2ipa.viqrTone(gvg)
    return {ci = ci, gvg = gvg, cf = cf, t = t}
end
---Returns the IPA transcription of the given initial consonant cluster.
-- @param ci Initial consonant cluster, as returned by vi2ipa.components.
-- @param gvg Interior vowel sequence; passed to context-sensitive entries.
-- @param cf Final consonant cluster; passed to context-sensitive entries.
-- @param dialect Key into vi2ipa.dialects.
-- @return IPA string, or "" when the dialect data has no matching entry.
function vi2ipa.ciToIPA(ci, gvg, cf, dialect)
    local data = vi2ipa.dialects[dialect].initialConsonantsToIPA
    -- Loanwords from some minority languages retain double consonants; when
    -- the whole cluster has no entry, fall back on the cluster without its
    -- first letter.
    -- NOTE(review): the original also tried data[ci:sub(1)], which is the same
    -- lookup as data[ci] and so never changed the result. ci:sub(1, 1) (the
    -- first letter) may have been intended; confirm before changing behaviour.
    local ipa = data[ci] or data[ci:sub(2)]
    -- Entries may be functions that pick a transcription from the context.
    if type(ipa) == "function" then ipa = ipa(gvg, cf) end
    return ipa or ""
end
---Returns the IPA transcription of the given glide-vowel-glide sequence.
-- In the dialect data an underscore follows a character that phonation
-- diacritics and the glottal stop attach to; all underscores are stripped
-- from the result.
-- @param ci Initial consonant cluster; passed to context-sensitive entries.
-- @param gvg Interior vowel sequence, with or without tone marks.
-- @param cf Final consonant cluster; passed to context-sensitive entries.
-- @param t VIQR tone mark, used to look up the dialect's tone attributes.
-- @param dialect Key into vi2ipa.dialects.
-- @return IPA string (a single value), or "" when no entry matches.
function vi2ipa.gvgToIPA(ci, gvg, cf, t, dialect)
    local gvgData = vi2ipa.dialects[dialect].interiorToIPA
    local toneAttributes = vi2ipa.dialects[dialect].toneAttributes[t] or {}
    -- Prefer an entry for the toned spelling, then the toneless one.
    local ipa = gvgData[gvg] or gvgData[vi2ipa.detone(gvg)] or ""
    if type(ipa) == "function" then ipa = ipa(ci, cf) or "" end

    -- Insert glottal stop.
    if toneAttributes.glottal then
        local lengthMark = "\203\144" -- U+02D0
        local repeated = toneAttributes.repeated
        -- Lua patterns are byte-based, so "?" after the two-byte length mark
        -- would only make its last byte optional. Capture up to both bytes
        -- and decide in the callback whether they really are a length mark.
        ipa = ipa:gsub(unichar_pattern .. "_(\203?\144?)", function (vowel, tail)
            local mark, rest = "", tail
            if tail == lengthMark then
                mark, rest = tail, ""
            end
            -- `rest` is non-empty only when the capture took the lead byte of
            -- some other character; hand it back untouched after the insert.
            if repeated then
                -- Vowel, glottal stop, vowel again; the length mark is dropped.
                return vowel .. "_ʔ" .. vowel .. "_" .. rest
            end
            return vowel .. "_" .. mark .. "ʔ" .. rest
        end)
    end

    -- Insert breathy-voice diacritic.
    if toneAttributes.breathy then ipa = ipa:gsub("_", "\204\164_") -- U+0324
    -- Or insert creaky-voice diacritic.
    elseif toneAttributes.creaky then ipa = ipa:gsub("_", "\204\176_") -- U+0330
    end

    -- Strip the placeholders; the parentheses drop gsub's count.
    return (ipa:gsub("_", ""))
end
---Returns the IPA transcription of the given final consonant cluster.
-- @param ci Initial consonant cluster; passed to context-sensitive entries.
-- @param gvg Interior vowel sequence; passed to context-sensitive entries.
-- @param cf Final consonant cluster to transcribe.
-- @param dialect Key into vi2ipa.dialects.
-- @return IPA string, or "" when the dialect data has no matching entry.
function vi2ipa.cfToIPA(ci, gvg, cf, dialect)
    local entry = vi2ipa.dialects[dialect].finalConsonantsToIPA[cf]
    if type(entry) ~= "function" then
        return entry or ""
    end
    -- Function entries choose a transcription from the surrounding sounds.
    return entry(ci, gvg) or ""
end
---Returns the IPA tone letters for the given word.
-- [[Wikipedia:wikt:vi:Bản mẫu:vie-pron/VieTn]] and [[Wikipedia:wikt:vi:Bản mẫu:vie-pron/VieT]]
-- @param word Word whose tone mark is looked up.
-- @param dialect Key into vi2ipa.dialects.
-- @return The dialect's entry for the word's VIQR tone mark; nil when the
-- dialect has no entry for it.
function vi2ipa.viqrToneToIPA(word, dialect)
    local mark = vi2ipa.viqrTone(word)
    if mark then
        return vi2ipa.dialects[dialect].viqrTonesToIPA[mark]
    end
end
---Returns the IPA transcription of the given Vietnamese text.
-- Reads the text from the `text` or first positional argument and the dialect
-- from the `dialect` or second positional argument.
-- @usage {{#invoke:vi2ipa|ipa|tiếng Việt}}
-- @param frame Scribunto frame object.
-- @return Transcription produced by vi2ipa._ipa.
function vi2ipa.ipa(frame)
    local args = frame.args
    local text = args.text or args[1] or ""

    -- Template wrappers commonly pass an unset parameter as an empty string,
    -- which is truthy in Lua; treat it as "not given" so the default dialect
    -- applies instead of indexing a non-existent dialect table.
    local dialect = args.dialect
    if dialect == nil or dialect == "" then dialect = args[2] end
    if dialect == nil or dialect == "" then dialect = "Hà Nội" end

    return vi2ipa._ipa(text, dialect)
end
---Returns the IPA transcription of the given text in the given dialect.
-- The text is lowercased, split into words, and each word is transcribed as
-- initial consonants + interior vowels + final consonants, followed by the
-- tone letters wrapped in a grey <span>.
-- @param text Vietnamese text.
-- @param dialect Key into vi2ipa.dialects.
-- @return Space-separated transcriptions of the words, as one string.
function vi2ipa._ipa(text, dialect)
    local ipa = {}
    -- NOTE(review): this character class is byte-based, so it accepts any run
    -- of bytes that occur in the encodings of the listed letters, not just
    -- those letters themselves.
    for word in vi2ipa.lower(text):gmatch("([a-z" .. lower .. "]+)") do
        local c = vi2ipa.components(word)
        ipa[#ipa + 1] =
            vi2ipa.ciToIPA(c.ci, c.gvg, c.cf, dialect) ..
            vi2ipa.gvgToIPA(c.ci, c.gvg, c.cf, c.t, dialect) ..
            vi2ipa.cfToIPA(c.ci, c.gvg, c.cf, dialect) ..
            "<span style='color: #808080;'>" ..
            -- viqrToneToIPA returns nil when the dialect lacks an entry for
            -- the tone; fall back on "" like the other lookups do rather
            -- than failing on a nil concatenation.
            (vi2ipa.viqrToneToIPA(word, dialect) or "") ..
            "</span>"
    end
    return table.concat(ipa, " ")
end
---Returns an HTML table row with one header cell for each supported dialect.
-- The first header cell labels the word column.
-- @usage {{#invoke:vi2ipa|tableColumnHeaders}}
-- @return HTML string of the form <tr><th>…</th>…</tr>.
function vi2ipa.tableColumnHeaders()
    local row = "<tr><th>Từ</th>"
    for _, name in ipairs(dialects) do
        row = row .. "<th>" .. name .. "</th>"
    end
    return row .. "</tr>"
end
---Returns an HTML table row of IPA transcriptions of the given word in all the
-- supported dialects. Adjacent, identical table cells are combined.
-- Reads the word from the `word` or first positional argument.
-- @usage {{#invoke:vi2ipa|tableRow|tiếng Việt}}
-- @param frame Scribunto frame object.
-- @return HTML string produced by vi2ipa._tableRow.
function vi2ipa.tableRow(frame)
    -- Default to "" (as vi2ipa.ipa does) so a call without arguments yields an
    -- empty row instead of failing on a nil word further down.
    return vi2ipa._tableRow(frame.args.word or frame.args[1] or "")
end
---Builds the HTML table row for one word.
-- The word is transcribed once per dialect, in the order of the `dialects`
-- list; neighbouring dialects with the same transcription share one cell
-- whose colspan covers all of them.
-- @param word Vietnamese word or phrase.
-- @return HTML string: a row header with the word followed by the cells.
function vi2ipa._tableRow(word)
    -- Run-length encode the transcriptions from left to right.
    local runs = {}
    for index = 1, #dialects do
        local transcription = vi2ipa._ipa(word, dialects[index])
        local previous = runs[#runs]
        if previous and previous.ipa == transcription then
            previous.span = previous.span + 1
        else
            runs[#runs + 1] = {ipa = transcription, span = 1}
        end
    end

    local html = {"<tr><th scope='row'>" .. word .. "</th>"}
    for _, run in ipairs(runs) do
        local attribute = ""
        if run.span > 1 then
            attribute = " colspan='" .. run.span .. "'"
        end
        html[#html + 1] = "<td" .. attribute .. ">" .. run.ipa .. "</td>"
    end
    html[#html + 1] = "</tr>"
    return table.concat(html)
end
-- Export the module table; its functions are what {{#invoke:}} calls reach.
return vi2ipa