-- Jump to content
--
-- Module:ja-ruby
--
-- From Wiktionary
--
-- Documentation for this module may be created at Module:ja-ruby/doc

local export = {}

-- Convert every hiragana character in the range ぁ-ゖ to the character
-- 96 code points above it; all other characters are left untouched.
local function str_hira_to_kata(s)
    local function shift_up(ch)
        local cp = mw.ustring.codepoint(ch)
        return mw.ustring.char(cp + 96)
    end
    local converted = mw.ustring.gsub(s, '[ぁ-ゖ]', shift_up)
    return converted
end
-- Convert every katakana character in the range ァ-ヶ to the character
-- 96 code points below it; all other characters are left untouched.
local function str_kata_to_hira(s)
    local function shift_down(ch)
        local cp = mw.ustring.codepoint(ch)
        return mw.ustring.char(cp - 96)
    end
    local converted = mw.ustring.gsub(s, '[ァ-ヶ]', shift_down)
    return converted
end

-- Align two strings character by character (Wagner–Fischer edit distance)
-- and split both into parallel runs of "identical" and "different" text.
--
-- s1, s2: the strings to compare.
-- limit:  optional budget; when len1 * len2 * 20 exceeds it, no alignment is
--         attempted and each string is returned whole as a single segment.
--
-- Returns {r1, r2}, two arrays of equal length. r1[k] and r2[k] are the
-- k-th segments of s1 and s2; segments built from "same" steps are equal in
-- both arrays, the others hold the substituted/deleted/inserted characters.
local function str_ucompare(s1, s2, limit)
    -- Keep the original strings: s1/s2 are rebound to character arrays below,
    -- and the early return must hand back strings, not those arrays.
    local raw1, raw2 = s1, s2
    s1, s2 = mw.text.split(s1, ''), mw.text.split(s2, '')
    local len1, len2 = #s1, #s2
    if limit and len1 * len2 * 20 > limit then return {{raw1}, {raw2}} end

    -- m_cost[i + 1][j + 1]: edit distance between the first i characters of
    -- s1 and the first j characters of s2.
    -- m_step[i + 1][j + 1]: step taken to reach that cell
    --   1 = same character, 2 = substitution, 3 = deletion, 4 = insertion.
    local m_cost, m_step = {{0}}, {{}}
    for i = 1, len1 do
        m_cost[i + 1] = {i}
        m_step[i + 1] = {3}
    end
    for j = 1, len2 do
        m_cost[1][j + 1] = j
        m_step[1][j + 1] = 4
    end

    for i = 1, len1 do
        for j = 1, len2 do
            local b_same = s1[i] == s2[j]
            local c_sub = m_cost[i][j] + (b_same and 0 or 1)
            local c_del = m_cost[i][j + 1] + 1
            local c_ins = m_cost[i + 1][j] + 1
            if c_sub <= c_del and c_sub <= c_ins then
                m_cost[i + 1][j + 1] = c_sub
                m_step[i + 1][j + 1] = b_same and 1 or 2
            elseif c_del <= c_ins then
                m_cost[i + 1][j + 1] = c_del
                m_step[i + 1][j + 1] = 3
            else
                m_cost[i + 1][j + 1] = c_ins
                m_step[i + 1][j + 1] = 4
            end
        end
    end

    -- Walk back from the bottom-right cell, recording each step together
    -- with the character indices it refers to. m_step[1][1] is nil, which
    -- terminates the walk.
    local i1, i2 = len1 + 1, len2 + 1
    local m_offset = {{-1, -1}, {-1, -1}, {-1, 0}, {0, -1}}
    local r_step_rev = {}
    local r_step_rev_pos1 = {}
    local r_step_rev_pos2 = {}
    local step = m_step[i1][i2]
    while step do
        table.insert(r_step_rev, step)
        i1 = i1 + m_offset[step][1]
        i2 = i2 + m_offset[step][2]
        table.insert(r_step_rev_pos1, i1)
        table.insert(r_step_rev_pos2, i2)
        step = m_step[i1][i2]
    end

    -- Replay the steps in forward order, grouping consecutive "same" steps
    -- into one segment and consecutive non-"same" steps into another.
    local r1, r2 = {}, {}
    local i = #r_step_rev
    step = r_step_rev[i]
    while i > 0 do
        local r1_f, r2_f = {}, {}
        if step == 1 then
            repeat
                table.insert(r1_f, s1[r_step_rev_pos1[i]])
                table.insert(r2_f, s2[r_step_rev_pos2[i]])
                i = i - 1
                step = r_step_rev[i]
            until step ~= 1
        else
            while true do
                if step == 2 then
                    table.insert(r1_f, s1[r_step_rev_pos1[i]])
                    table.insert(r2_f, s2[r_step_rev_pos2[i]])
                elseif step == 3 then
                    table.insert(r1_f, s1[r_step_rev_pos1[i]])
                elseif step == 4 then
                    table.insert(r2_f, s2[r_step_rev_pos2[i]])
                else
                    -- step is 1 (start of a "same" run) or nil (exhausted)
                    break
                end
                i = i - 1
                step = r_step_rev[i]
            end
        end
        table.insert(r1, table.concat(r1_f))
        table.insert(r2, table.concat(r2_f))
    end
    return {r1, r2}
end

-- Split a string containing [[target]] / [[target|label]] wiki links into
-- an array of segments. Plain text becomes {text = ...}; a link becomes
-- {text = label, linkto = target}. A link whose "|" is missing, or sits at
-- the very start or end of the link body, uses the whole body for both
-- fields. Empty links "[[]]" produce no segment. When an opening "[["
-- has no "]]" after it, the rest of the string is returned as plain text.
local function str_parse_link(s)
    local segments = {}
    local cursor = 1
    local open_at = s:find('%[%[', cursor)
    while open_at do
        local next_open = s:find('%[%[', open_at + 1)
        local close_at = s:find('%]%]', open_at + 2)
        if close_at == nil then
            break
        end
        -- If further "[[" occur before the closing "]]", use the last one.
        while next_open and next_open < close_at do
            open_at = next_open
            next_open = s:find('%[%[', open_at + 1)
        end
        if open_at > cursor then
            segments[#segments + 1] = {text = s:sub(cursor, open_at - 1)}
        end
        if close_at > open_at + 2 then
            local body = s:sub(open_at + 2, close_at - 1)
            local bar = body:find('|')
            local has_label = bar and bar > 1 and bar < body:len()
            if has_label then
                segments[#segments + 1] = {
                    text = body:sub(bar + 1),
                    linkto = body:sub(1, bar - 1),
                }
            else
                segments[#segments + 1] = {text = body, linkto = body}
            end
        end
        cursor = close_at + 2
        open_at = s:find('%[%[', cursor)
    end
    if #s >= cursor then
        segments[#segments + 1] = {text = s:sub(cursor)}
    end
    return segments
end

-- Merge a link segmentation and a ruby segmentation of the same text into
-- one nested table.
--
-- link_table: array of {text = string, linkto = string|nil}.
-- ruby_table: array of {text = string, ruby = string|nil}.
-- Both arrays are walked in parallel; segment lengths are compared with
-- string:len(), i.e. in bytes. Entries of both input tables may have their
-- `text` field rewritten while a segment is being split.
--
-- Returns an array whose elements are one of:
--   {text = string, ruby = ...}          plain or ruby-annotated text
--   {text = {...}, linkto = string}      a link wrapping ruby pieces
--   {text = {...}, ruby = {...}}         a ruby wrapping link pieces; its
--                                        ruby is parsed with str_parse_link
local function table_merge(link_table, ruby_table)
    local r = {}
    local r_sub, r_insert
    local len_cut
    local id_l, id_r = 1, 1
    while id_l <= #link_table and id_r <= #ruby_table do
        -- > 0: the link segment is longer than the ruby segment
        -- < 0: the ruby segment is longer than the link segment
        len_cut = link_table[id_l].text:len() - ruby_table[id_r].text:len()
        if ruby_table[id_r].ruby and (ruby_table[id_r].ruby:find'%[%[..-%]%]' or len_cut < 0) then
            -- The ruby itself contains a link, or it spans more than the
            -- current link segment: the ruby becomes the outer element.
            r_sub = {
                text = {},
                ruby = str_parse_link(ruby_table[id_r].ruby),
            }
            r_insert = r_sub.text
            table.insert(r, r_sub)
            -- Swallow whole link segments until the ruby text is covered.
            while len_cut < 0 do
                table.insert(r_insert, {
                    text = link_table[id_l].text,
                    linkto = link_table[id_l].linkto
                })
                id_l = id_l + 1
                len_cut = len_cut + link_table[id_l].text:len()
            end
            table.insert(r_insert, {
                text = link_table[id_l].text:sub(1, -1 - len_cut),
                linkto = link_table[id_l].linkto
            })
            if len_cut == 0 then
                id_l = id_l + 1
                id_r = id_r + 1
            else
                -- Keep the unused tail of the link segment for the next round.
                link_table[id_l].text = link_table[id_l].text:sub(-len_cut)
                id_r = id_r + 1
            end
        else
            -- The link segment (if it is a link) becomes the outer element
            -- and ruby pieces are collected inside it.
            if link_table[id_l].linkto then
                r_sub = {
                    text = {},
                    linkto = link_table[id_l].linkto,
                }
                r_insert = r_sub.text
                table.insert(r, r_sub)
            else
                r_insert = r
            end
            while len_cut > 0 and not (ruby_table[id_r].ruby and ruby_table[id_r].ruby:find'%[%[..-%]%]') do
                table.insert(r_insert, {
                    text = ruby_table[id_r].text,
                    ruby = ruby_table[id_r].ruby,
                })
                id_r = id_r + 1
                len_cut = len_cut - ruby_table[id_r].text:len()
            end
            if len_cut == 0 then
                table.insert(r_insert, {
                    text = ruby_table[id_r].text,
                    ruby = ruby_table[id_r].ruby,
                })
                id_l = id_l + 1
                id_r = id_r + 1
            else
                if ruby_table[id_r].ruby then
                    -- A ruby segment straddles the link boundary: put the
                    -- uncovered tail back and let the next round handle it.
                    link_table[id_l].text = link_table[id_l].text:sub(-(len_cut + ruby_table[id_r].text:len()))
                else
                    -- A plain segment straddles the link boundary: split it.
                    table.insert(r_insert, {
                        text = ruby_table[id_r].text:sub(1, -1 + len_cut),
                    })
                    ruby_table[id_r].text = ruby_table[id_r].text:sub(len_cut)
                    id_l = id_l + 1
                end
            end
        end
    end
    return r
end

-- Total length, as counted by mw.ustring.len, of all `text` fields in a
-- ruby table; nested tables are summed recursively.
function export.len_text(ruby_table)
    local total = 0
    for _, entry in ipairs(ruby_table) do
        local content = entry.text
        local n = type(content) == 'string' and mw.ustring.len(content) or export.len_text(content)
        total = total + n
    end
    return total
end

-- Total length, as counted by mw.ustring.len, of the reading of a ruby
-- table: each entry contributes its `ruby` field, or its `text` field when
-- it has no ruby. Nested tables are summed recursively.
function export.len_ruby(ruby_table)
    local total = 0
    for _, entry in ipairs(ruby_table) do
        local content = entry.ruby or entry.text
        local n = type(content) == 'string' and mw.ustring.len(content) or export.len_ruby(content)
        total = total + n
    end
    return total
end

-- Concatenate the `text` fields of a ruby table, descending into nested
-- tables, and return the result as one string.
function export.to_text(ruby_table)
    local pieces = {}
    for index, entry in ipairs(ruby_table) do
        local content = entry.text
        pieces[index] = type(content) == 'string' and content or export.to_text(content)
    end
    return table.concat(pieces)
end

-- Concatenate the reading of a ruby table: each entry contributes its
-- `ruby` field, or its `text` field when it has no ruby. Nested tables
-- are flattened recursively.
function export.to_ruby(ruby_table)
    local pieces = {}
    for index, entry in ipairs(ruby_table) do
        local content = entry.ruby or entry.text
        pieces[index] = type(content) == 'string' and content or export.to_ruby(content)
    end
    return table.concat(pieces)
end

-- Serialise a ruby table to a string.
--
-- ruby_table: array of {text = string|table, ruby = string|table|nil,
--             linkto = string|nil}.
-- break_link: when truthy, a link that wraps nested pieces is re-emitted as
--             one link per piece, each placed inside that piece's ruby.
-- lb, lm, lf: left border, separator and right border used for links.
-- rb, rm, rf: left border, separator and right border used for ruby.
--
-- When an entry carries `linkto`, it is rendered as a link and its `ruby`
-- is not rendered. A link with an empty label uses "_" as the label; an
-- empty link target or empty ruby renders the bare text.
local function table_to_markup(ruby_table, break_link, lb, lm, lf, rb, rm, rf)
    local out = {}
    for _, item in ipairs(ruby_table) do
        local target, reading = item.linkto, item.ruby
        local body = item.text

        if type(body) ~= 'string' then
            if break_link and target then
                -- Push the link down into every non-empty child.
                local wrapped = {}
                for _, child in ipairs(body) do
                    if child.text ~= '' or child.ruby and child.ruby ~= '' then
                        wrapped[#wrapped + 1] = {
                            text = {{
                                text = child.text,
                                linkto = target,
                            }},
                            ruby = child.ruby,
                        }
                    end
                end
                target, reading = nil, nil
                body = table_to_markup(wrapped, break_link, lb, lm, lf, rb, rm, rf)
            else
                body = table_to_markup(body, break_link, lb, lm, lf, rb, rm, rf)
            end
        end

        local piece = body
        if target then
            if target ~= '' then
                piece = lb .. target .. lm .. (body ~= '' and body or '_') .. lf
            end
        elseif reading then
            if type(reading) ~= 'string' then
                reading = table_to_markup(reading, break_link, lb, lm, lf, rb, rm, rf)
            end
            if reading ~= '' then
                piece = rb .. body .. rm .. reading .. rf
            end
        end
        out[#out + 1] = piece
    end
    return table.concat(out)
end

-- "options.markup": Use custom markups. See below.
-- "options.break_link = true": Change [[...|<ruby>...</ruby>]] to <ruby>[[...]]</ruby>.
-- Render a ruby table in the compact "[text](ruby)" markup, with links as
-- "[[target|label]]". Every delimiter can be overridden through
-- options.markup (link_border_left/middle/right, ruby_border_left/middle/right).
function export.to_markup(ruby_table, options)
    options = options or {}
    -- Must be local: a bare assignment would create a global `omarkup`.
    local omarkup = options.markup or {}

    return table_to_markup(
        ruby_table,
        options.break_link,
        omarkup.link_border_left or '[[',
        omarkup.link_border_middle or '|',
        omarkup.link_border_right or ']]',
        omarkup.ruby_border_left or '[',
        omarkup.ruby_border_middle or '](',
        omarkup.ruby_border_right or ')')
end

-- The options are the same as "function export.to_markup"
-- Render a ruby table as wikitext: links as "[[target|label]]" and ruby as
-- <ruby>text<rp>(</rp><rt>reading</rt><rp>)</rp></ruby>. Every delimiter can
-- be overridden through options.markup (link_border_left/middle/right,
-- ruby_border_left/middle/right).
function export.to_wiki(ruby_table, options)
    options = options or {}
    -- Must be local: a bare assignment would create a global `omarkup`.
    local omarkup = options.markup or {}

    return table_to_markup(
        ruby_table,
        options.break_link,
        omarkup.link_border_left or '[[',
        omarkup.link_border_middle or '|',
        omarkup.link_border_right or ']]',
        omarkup.ruby_border_left or '<ruby>',
        omarkup.ruby_border_middle or '<rp>(</rp><rt>',
        omarkup.ruby_border_right or '</rt><rp>)</rp></ruby>')
end

-- Parse the compact "[text](ruby)" markup (optionally mixed with
-- [[wiki links]]) into the nested ruby/link table.
--
-- The readings are first pulled out of the markup, leaving a string that
-- only carries links; separately, the link-free text is scanned for
-- "[text](ruby)" pairs to build the ruby segmentation. The two
-- segmentations are then combined with table_merge.
function export.parse_markup(markup)
    -- Collect the readings in order of appearance and strip them, together
    -- with the surrounding square brackets, from the markup.
    local readings = {}
    local without_ruby = markup:gsub('(%b[])(%b())', function(base, reading)
        table.insert(readings, reading:sub(2, -2))
        return base:sub(2, -2)
    end)
    local link_table = str_parse_link(without_ruby)

    local plain_text = export.to_text(str_parse_link(markup))
    local ruby_table = {}
    local cursor = 1
    local reading_index = 1
    for from, base, _, upto in plain_text:gmatch('()(%b[])(%b())()') do
        -- Text between the previous pair and this one carries no ruby.
        if cursor < from then
            ruby_table[#ruby_table + 1] = {text = plain_text:sub(cursor, from - 1)}
        end
        -- "[]" with nothing inside contributes no segment.
        if #base > 2 then
            local reading = readings[reading_index]
            ruby_table[#ruby_table + 1] = {
                text = base:sub(2, -2),
                ruby = reading ~= '' and reading or nil,
            }
        end
        cursor = upto
        reading_index = reading_index + 1
    end
    if #plain_text >= cursor then
        ruby_table[#ruby_table + 1] = {text = plain_text:sub(cursor)}
    end

    return table_merge(link_table, ruby_table)
end

-- "options.try == nil": Raise an error when the initial match fails.
-- "options.try == 'return'": Return "nil, (error information)" when the initial match failed.
-- "options.try == 'force'": Try every possible pattern when the initial match failed.
-- "options.try_force_limit": Limit the time used by "options.try == 'force'".
-- "options.space == nil": Remove spaces between kana or kanji but preserve elsewhere.
-- "options.space == 'all'": Preserve all spaces.
-- "options.space == 'none'": Remove all spaces.
-- "options.allow_ruby_link == true": Try to match the links in the rubies.
-- Match a term against its kana reading and return the nested ruby/link
-- table for it.
--
-- term:    the written form; may contain [[wiki links]] and "%" separators.
-- kana:    the reading; "^", "-" and "." are stripped, "%" and spaces kept.
-- options: optional table; the fields read here are try, try_force_limit,
--          space and allow_ruby_link.
--
-- "%" splits term and kana into the same number of pieces, which are matched
-- pairwise. For each piece a pattern is built from the term in which kana
-- must match themselves (hiragana and katakana interchangeable) and every
-- run of rubiable characters captures the corresponding reading.
--
-- Returns the merged table, or `nil, message` when matching fails and
-- options.try == 'return'. Raises an error when the "%" counts differ, or
-- when matching fails and options.try is neither 'force' nor 'return'.
function export.parse_text(term, kana, options)
    options = options or {}

    local pat_kana = 'ぁ-ゖァ-ヶー' -- signs subject to hira-kata matching
    local pat_kanji_probable = '々一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧﨨﨩0-9A-Za-z〆〇' -- signs that can have ruby, but not spaces
    local pat_rubiable_probable = '0-9a-zA-Zα-ωΑ-Ω' -- signs that can have both ruby and spaces
    local pat_mute_probable = '%^%-%.゠・' -- signs that may appear in term, but not kana

    -- _remove_space(_r [, context_ak, context_bk]) post-processes a merged
    -- table in place according to options.space and returns it.
    local _remove_space
    if options.space == 'none' then
        -- Strip every space from text, link targets and rubies.
        _remove_space = function(_r)
            -- Advance (p1, p2) to the next leaf: p1 indexes _r, p2 indexes
            -- _r[p1].text when that is a table (nil otherwise).
            local function _next(p1, p2)
                if p2 and p2 < #_r[p1].text then
                    return p1, p2 + 1
                end
                p1 = p1 + 1
                if p1 > #_r then
                    p2 = nil
                else
                    p2 = type(_r[p1].text) ~= 'string' and 1 or nil
                end
                return p1, p2
            end
            local pos1, pos2 = _next(0, nil)
            while pos1 <= #_r do
                -- Local on purpose: the recursive calls below run this same
                -- loop, and a shared variable would be rebound by them.
                local _t = pos2 and _r[pos1].text[pos2] or _r[pos1]
                _t.text = _t.text:gsub(' ', '')
                if _t.linkto then _t.linkto = _remove_space({{text = _t.linkto}})[1].text end
                if _t.ruby then _t.ruby = _remove_space({{text = _t.ruby}})[1].text end
                if pos2 then
                    if _r[pos1].linkto then _r[pos1].linkto = _remove_space({{text = _r[pos1].linkto}})[1].text end
                    if _r[pos1].ruby then _r[pos1].ruby = _remove_space({{text = _r[pos1].ruby}})[1].text end
                end
                pos1, pos2 = _next(pos1, pos2)
            end
            return _r
        end
    elseif options.space == 'all' then
        -- Preserve all spaces.
        _remove_space = function(_r)
            return _r
        end
    else
        -- Default: drop spaces that sit between kana/kanji runs, keep the
        -- others. context_ak / context_bk tell a recursive call whether the
        -- text is preceded / followed by kana or kanji.
        _remove_space = function(_r, context_ak, context_bk)
            local function _next(p1, p2)
                if p2 and p2 < #_r[p1].text then
                    return p1, p2 + 1
                end
                p1 = p1 + 1
                if p1 > #_r then
                    p2 = nil
                else
                    p2 = type(_r[p1].text) ~= 'string' and 1 or nil
                end
                return p1, p2
            end
            local pos1, pos2 = _next(0, nil)
            local pos3, pos4 = pos1, pos2
            local after_k = context_ak
            local before_k
            local _t, char
            while pos1 <= #_r do
                -- Look ahead (pos3, pos4) for the next leaf holding a
                -- character other than space or apostrophe, and note whether
                -- that character is kana/kanji.
                if pos3 == pos1 and (pos4 == pos2 or pos4 < pos2) or pos3 < pos1 then
                    before_k = context_bk
                    pos3, pos4 = _next(pos1, pos2)
                    while pos3 <= #_r do
                        _t = pos4 and _r[pos3].text[pos4] or _r[pos3]
                        char = mw.ustring.find(_t.text, '[^ \']')
                        if char then
                            char = mw.ustring.sub(_t.text, char, char)
                            before_k = mw.ustring.find(char, '^['..pat_kanji_probable..pat_kana..']$')
                            break
                        end
                        pos3, pos4 = _next(pos3, pos4)
                    end
                end

                _t = pos2 and _r[pos1].text[pos2] or _r[pos1]
                if _t.linkto then _t.linkto = _remove_space({{text = _t.linkto}}, after_k, before_k)[1].text end
                if _t.ruby then _t.ruby = _remove_space({{text = _t.ruby}}, after_k, before_k)[1].text end
                if pos2 then
                    if _r[pos1].linkto then _r[pos1].linkto = _remove_space({{text = _r[pos1].linkto}}, after_k, before_k)[1].text end
                    if _r[pos1].ruby then _r[pos1].ruby = _remove_space({{text = _r[pos1].ruby}}, after_k, before_k)[1].text end
                end

                -- Rebuild the text: a gap made only of spaces/apostrophes
                -- loses its spaces when it lies between two kana/kanji runs.
                local seg = {}
                local i0 = 1
                for i1, m1, i2 in mw.ustring.gmatch(_t.text, '()(['..pat_kanji_probable..pat_kana..']+)()') do
                    if after_k and not mw.ustring.sub(_t.text, i0, i1 - 1):find'[^ \']' then
                        table.insert(seg, (mw.ustring.sub(_t.text, i0, i1 - 1):gsub(' ', '')))
                    else
                        table.insert(seg, mw.ustring.sub(_t.text, i0, i1 - 1))
                    end
                    table.insert(seg, m1)
                    after_k = true
                    i0 = i2
                end
                after_k = after_k and not mw.ustring.sub(_t.text, i0):find'[^ \']'
                if after_k and before_k then
                    table.insert(seg, (mw.ustring.sub(_t.text, i0):gsub(' ', '')))
                else
                    table.insert(seg, mw.ustring.sub(_t.text, i0))
                end
                _t.text = table.concat(seg)

                pos1, pos2 = _next(pos1, pos2)
            end
            return _r
        end
    end

    -- Create the link table
    -- e.g. "[[エドガー・アラン・ポー|アラン・ポー]]の[[推理 小説]]"
    local link_table = str_parse_link((term:gsub('%%', ''))) -- remove '%'
    --[[link_table = {
        {text = 'アラン・ポー', linkto = 'エドガー・アラン・ポー'},
        {text = 'の'},
        {text = '推理 小説', linkto = '推理 小説'},
    }]]

    -- Remove romaji markup
    kana = kana:gsub('[%^%-%.]', '') -- remove '^', '-', '.', preserve '%', ' '

    -- Create the ruby table
    -- e.g. 'アラン・ポーの推理 小説', 'あらん ぽー の すいり しょうせつ'
    -- ("ぽお" is not allowed)
    local ruby_table = {}
    local plain_term_raw = export.to_text(str_parse_link(term)) -- Remove links: [[A|B]] -> B, [[C]] -> C
    local plain_kana_raw = options.allow_ruby_link and kana or export.to_text(str_parse_link(kana))
    local plain_term = mw.text.split(plain_term_raw, '%%')
    local plain_kana = mw.text.split(plain_kana_raw, '%%')
    if #plain_term ~= #plain_kana then
        mw.logObject(plain_term)
        mw.logObject(plain_kana)
        error('Separator "%" in the kanji and kana strings do not match.')
    end
    for i, plain_term_i in ipairs(plain_term) do
        -- Skip pieces that are empty on both sides (compare the piece, not
        -- the whole `plain_term` array, which is never equal to '').
        if plain_term_i ~= '' or plain_kana[i] ~= '' then
            -- pattern_ruby: capture groups, in order, that make up the
            -- pattern; pattern_ruby_is_ruby[n] marks the groups that capture
            -- a reading rather than literal text.
            local pattern_ruby, pattern_ruby_is_ruby = {}, {}
            -- Append a capture group matching the literal (non-rubiable)
            -- stretch s_sub, tolerating optional spaces between characters.
            local function _func_pat(s_sub)
                local in_xml_tag = false
                table.insert(pattern_ruby, '(' .. mw.ustring.gsub(s_sub, '.', function(m0)
                    if in_xml_tag then
                        if m0 == '>' then in_xml_tag = false end
                        return ''
                    else
                        if m0 == '<' then
                            in_xml_tag = true
                            return ' ?<.->'
                        else
                            local m0_m = m0
                            if m0:find'^[%(%)%.%%%+%-%*%?%[%]%^%$]$' then m0_m = '%' .. m0_m end
                            if mw.ustring.find(m0, '^['..pat_mute_probable..']$') then m0_m = '[' .. m0_m .. ' -]?'
                            elseif mw.ustring.find(m0, '^[ヶゖケ]$') then
                                m0_m = "[" .. str_kata_to_hira(m0_m) .. str_hira_to_kata(m0_m) .. "かがこカガコ]"
                            elseif mw.ustring.find(m0, '^['..pat_kana..']$') then
                                m0_m = "[" .. str_kata_to_hira(m0_m) .. str_hira_to_kata(m0_m) .. "]"
                            end
                            return ' ?' .. m0_m
                        end
                    end
                end) .. ' ?)')
            end
            local plain_term_noxml = plain_term_i:gsub('%b<>', '<>')
            local pos0 = 1
            for pos1, pos2 in mw.ustring.gmatch(plain_term_noxml, '()['..pat_kanji_probable..pat_rubiable_probable..']+()') do
                if pos0 < pos1 then _func_pat(mw.ustring.sub(plain_term_noxml, pos0, pos1 - 1)) end
                -- Adjacent rubiable runs share one reading capture.
                if not pattern_ruby_is_ruby[#pattern_ruby] then
                    table.insert(pattern_ruby, '(..-)')
                    pattern_ruby_is_ruby[#pattern_ruby] = true
                end
                pos0 = pos2
            end
            plain_term_noxml = mw.ustring.sub(plain_term_noxml, pos0)
            if #pattern_ruby == 0 and not mw.ustring.find(plain_term_noxml, '['..pat_kana..']') then
                -- if a "non-rubiable" and "non-kana" string is isolated by %, it matches anything.
                table.insert(pattern_ruby, '(.-)')
                pattern_ruby_is_ruby[#pattern_ruby] = true
            else
                if #plain_term_noxml > 0 then _func_pat(plain_term_noxml) end
            end
            local pat_ruby_s = table.concat(pattern_ruby)
            -- 'アラン・ポーの推理 小説' to '( ?[あア] ?[らラ] ?[んン] ?[・ -]? ?[ぽポ] ?ー ?[のノ] ?)(..-)( )(..-)'
            -- Execute matching
            local ruby_table_i_ruby = {mw.ustring.match(plain_kana[i], '^'..pat_ruby_s..'$')}
            if #ruby_table_i_ruby > 0 then
                local ruby_table_i_text = {mw.ustring.match(plain_term_i, '^'..pat_ruby_s..'$')}
                for n_match = 1, #pattern_ruby do
                    if pattern_ruby_is_ruby[n_match] and ruby_table_i_text[n_match] ~= ruby_table_i_ruby[n_match] then
                        table.insert(ruby_table, {
                            text = ruby_table_i_text[n_match],
                            ruby = ruby_table_i_ruby[n_match],
                        })
                    else
                        -- Literal text: extend the previous plain segment
                        -- when there is one, otherwise start a new one.
                        if #ruby_table > 0 and ruby_table[#ruby_table].ruby == nil then
                            ruby_table[#ruby_table].text = ruby_table[#ruby_table].text .. ruby_table_i_text[n_match]
                        else
                            table.insert(ruby_table, {text = ruby_table_i_text[n_match]})
                        end
                    end
                end
            elseif options.try == 'force' then
                require('Module:debug').track('ja-ruby/forced match')
                local forced_result = str_ucompare(plain_term_i, plain_kana[i], options.try_force_limit)
                for ii, vv in ipairs(forced_result[1]) do
                    table.insert(ruby_table, {
                        text = vv,
                        ruby = forced_result[2][ii] ~= vv and forced_result[2][ii] or nil,
                    })
                end
            elseif options.try == 'return' then
                return nil, 'Can not match "' .. plain_term_i .. '" and "' .. plain_kana[i] .. '".'
            else
                mw.log(pat_ruby_s)
                error('Can not match "' .. plain_term_i .. '" and "' .. plain_kana[i] .. '"')
            end
        end
    end
    --[[ruby_table = {
        {text = 'アラン・ポーの'},
        {text = '推理', ruby = 'すいり'},
        {text = ' '}
        {text = '小説', ruby = 'しょうせつ'},
    }]]

    -- Merge the ruby and link table
    --[[return {
        {text = 'アラン・ポー', linkto = 'エドガー・アラン・ポー'},
        {text = 'の'},
        {text = {
            {text = '推理', ruby = 'すいり'},
            {text = ''}
            {text = '小説', ruby = 'しょうせつ'},
        }, linkto = '推理小説'},
    }]]
    return _remove_space(table_merge(link_table, ruby_table))
end

-- shortcut
-- One-call entry point: parse the input and render it.
--
-- args.target selects the renderer: 'text', 'ruby' or 'markup'; any other
-- value (including nil) selects export.to_wiki.
-- args.term + args.kana are parsed with export.parse_text; otherwise
-- args.markup is parsed with export.parse_markup. args.options is passed on
-- to both the parser and the renderer.
--
-- Returns the rendered string, or the falsy result and message from
-- export.parse_text when it reports a failed match. Raises an error when
-- neither input form is supplied.
function export.ruby_auto(args)
    local renderers = {
        text = export.to_text,
        ruby = export.to_ruby,
        markup = export.to_markup,
    }
    local to_target = renderers[args.target] or export.to_wiki

    if args.term and args.kana then
        local parsed, err = export.parse_text(args.term, args.kana, args.options)
        if not parsed then
            return parsed, err
        end
        return to_target(parsed, args.options)
    end

    if args.markup then
        return to_target(export.parse_markup(args.markup, args.options), args.options)
    end

    error('Cannot find "term" and "kana" or "markup"')
end

return export