Jump to content

Module:Unicode

Unchecked
From Wikipedia

Documentation for this module may be created at Module:Unicode/doc

---A collection of functions for working with Unicode character data.
-- This module is not intended to provide general string processing
-- functionality.

require "bit"
require "hex"

-- Table of functions exported by this module; it is returned at the end of
-- the file.
local unicode = {}

---Formats the given number as a hexadecimal Unicode codepoint.
-- The hexadecimal digits are upper case and zero-padded to at least four
-- digits; larger codepoints use as many digits as they need.
-- NOTE(review): this function is declared as a global (no `local`), as it
-- always has been; confirm nothing else relies on that before localizing it.
-- @param decNum a number (or numeric string) representing a codepoint; must
--   be a non-negative integer
-- @returns a codepoint reference in the form “U+xxxx”
-- @raise an error if decNum is not a non-negative integer
function tohex(decNum)
    local num = tonumber(decNum)
    -- `num % 1 ~= 0` rejects fractional values as well as NaN and infinity.
    if not num or num < 0 or num % 1 ~= 0 then
        error("tohex: expected a non-negative integer codepoint, got "
            .. tostring(decNum), 2)
    end
    -- string.format performs both the hexadecimal conversion and the zero
    -- padding, so no external hex library is needed.
    return string.format("U+%04X", num)
end
---Template entry point for tohex.
-- Reads the codepoint from the `num` argument or, failing that, from the
-- first positional argument. Arguments supplied through #invoke arrive as
-- strings, so the value is converted to a number before it is formatted.
-- @param frame the invocation frame; its `args` table holds the arguments
-- @returns a codepoint reference in the form “U+xxxx”
-- @raise an error if the argument is missing or is not numeric
-- @usage {{#invoke: Unicode |tohex|119070}}
function unicode.tohex(frame)
    local raw = frame.args.num or frame.args[1]
    local num = tonumber(raw)
    if not num then
        error("Unicode.tohex: expected a numeric codepoint, got "
            .. tostring(raw))
    end
    return tohex(num)
end

---Returns the given character’s Unicode codepoint. If more than one
-- character is given, only the first character is considered.
-- The leading byte determines how many bytes the character occupies; the
-- payload bits of the leading byte and of each continuation byte are then
-- combined, most significant first. Overlong encodings and surrogate
-- codepoints are not rejected.
-- NOTE(review): this function is declared as a global (no `local`), as it
-- always has been; confirm nothing else relies on that before localizing it.
-- @param char a string that begins with a UTF-8–encoded character
-- @param formatted if truthy, return the codepoint formatted by tohex
--   (“U+xxxx”) instead of as a number
-- @returns the codepoint as a number, or as a formatted string when
--   formatted is truthy
-- @raise an error if char is not a non-empty string or does not begin with
--   a well-formed UTF-8 sequence of one to four bytes
function codepoint(char, formatted)
    if type(char) ~= "string" or char == "" then
        error("codepoint: expected a non-empty string", 2)
    end

    -- Only the bytes of the first character are ever read. Unpacking every
    -- byte of the argument is unnecessary and can exhaust the Lua stack when
    -- the string is very long.
    local lead = string.byte(char, 1)

    -- Determine the sequence length and the payload of the leading byte.
    local numBytes, value
    if lead < 0x80 then            -- 0xxxxxxx: single byte (ASCII)
        numBytes, value = 1, lead
    elseif lead < 0xC0 then        -- 10xxxxxx: continuation byte, not a lead
        error("codepoint: string starts with a UTF-8 continuation byte", 2)
    elseif lead < 0xE0 then        -- 110xxxxx: two-byte sequence
        numBytes, value = 2, lead - 0xC0
    elseif lead < 0xF0 then        -- 1110xxxx: three-byte sequence
        numBytes, value = 3, lead - 0xE0
    elseif lead < 0xF8 then        -- 11110xxx: four-byte sequence
        numBytes, value = 4, lead - 0xF0
    else
        error("codepoint: invalid UTF-8 leading byte", 2)
    end

    -- Each continuation byte (10xxxxxx) contributes its low six bits.
    for i = 2, numBytes do
        local byte = string.byte(char, i)
        if not byte or byte < 0x80 or byte > 0xBF then
            error("codepoint: truncated or malformed UTF-8 sequence", 2)
        end
        value = value * 64 + (byte - 0x80)
    end

    if formatted then
        return tohex(value)
    end
    return value
end
---Template entry point for codepoint.
-- Reads the character from the `char` argument or, failing that, from the
-- first positional argument. Arguments supplied through #invoke arrive as
-- strings, and every string (even an empty one) is truthy in Lua, so the
-- `formatted` argument is normalised first: an empty value, “0”, “no” and
-- “false” (case-insensitive, surrounding whitespace ignored) mean
-- unformatted output; any other value requests formatted output.
-- @param frame the invocation frame; its `args` table holds the arguments
-- @returns the codepoint as a number, or in the form “U+xxxx” when the
--   `formatted` argument is set
-- @usage {{#invoke: Unicode |codepoint|€}}
function unicode.codepoint(frame)
    local args = frame.args
    local flag = args.formatted
    local formatted
    if type(flag) == "string" then
        -- The extra parentheses discard gsub's second result (the count).
        local text = string.lower((string.gsub(flag, "^%s*(.-)%s*$", "%1")))
        formatted = not (text == "" or text == "0"
            or text == "no" or text == "false")
    else
        -- Non-string values (e.g. a boolean from a Lua caller) keep their
        -- ordinary truthiness.
        formatted = flag and true or false
    end
    return codepoint(args.char or args[1], formatted)
end

-- Return the table of exported functions.
return unicode