-- Module:Multilingual description

local p = {}

--[==[
Remap some "valid" language codes that are still unknown, but are known by another code, in order
to get a visible language name (and, if possible, BCP 47 conformance).

Keys are the codes as they may be given by users (lowercase, hyphen-separated); values are the
replacement codes. Every entry must stay on its own line: a trailing "--" comment runs to the end
of the physical line and would otherwise swallow the entries that follow it.
--]==]
local remappedLanguages = {
    -- Known codes (unfortunately broken in Wikimedia) changed to new codes (also known, but
    -- conforming to BCP 47).
    ['als'] = 'gsw', -- 'als' on Wikimedia for Alemannic conflicts with the standard 'als', which actually refers to the unrelated standard variant of the Albanian language
    ['nrm'] = 'nrf', -- 'nrm' on Wikimedia for Norman conflicts with the standard 'nrm', which actually refers to the unrelated Narom language

    ['bat-smg'] = 'sgs', -- legacy interwiki still supported, newer conforming code now supported as well in interwiki
    ['be-x-old'] = 'be-tarask', -- legacy interwiki still supported, newer conforming code now supported as well in interwiki
    ['fiu-vro'] = 'vro', -- legacy interwiki still supported, newer conforming code now supported as well in interwiki
    ['roa-rup'] = 'rup', -- code not conforming to BCP 47 (legacy interwiki still supported, newer code supported as well); the new standard code should be used (and is now recognized as interwiki)
    ['roa-tara'] = 'nap-taran', -- code not conforming to BCP 47 (continental variant of Neapolitan), should be replaced by a conforming variant code
    ['zh-classical'] = 'lzh', -- code not conforming to BCP 47, replaced by standard code also supported in interwiki

    ['bh'] = 'bho', -- legacy interwiki still supported but ambiguous as a family, newer code now supported as well in interwiki; 'bh' was used in Wikimedia to refer to Bhojpuri only and not the whole Bihari family
    ['bu'] = 'my', -- legacy code from ISO 639 deprecated, newer code is preferred and used as interwiki
    ['iw'] = 'he', -- legacy code from ISO 639 deprecated, newer code is preferred and used as interwiki
    ['jw'] = 'jv', -- legacy code from ISO 639 deprecated, newer code is preferred and used as interwiki
    ['zh-min-nan'] = 'nan', -- legacy BCP 47 code, valid but deprecated in favor of new code also supported in interwiki
    ['zh-wuu'] = 'wuu', -- legacy BCP 47 code, valid but deprecated in favor of new code also supported in interwiki
    ['zh-yue'] = 'yue', -- legacy BCP 47 code, valid but deprecated in favor of new code also supported in interwiki

    -- Standard BCP 47 codes that can still not be used in MediaWiki, and must be replaced for now
    -- by valid BCP 47 codes (ignoring the specific variant).

    ['en-us'] = 'en', -- both codes are conforming and supported, only the second one is known (there's no support for the US variant which is implicit, unlike variants in CA, GB, IN, ZA)
    ['fa-af'] = 'fa', -- both codes are conforming and supported, only the second one is known, actually means "Eastern Dari"
    ['fr-x-galo'] = 'fr', -- both codes are conforming and supported, only the second one is known
    ['ha-latn'] = 'ha', -- both codes are conforming and supported, only the second one is known, the Latin script is the default since the 1950's
    ['ha-arab'] = 'ha', -- both codes are conforming and supported, only the second one is known, the Arabic script is historic, without clear orthography
    ['ko-kr'] = 'ko', -- both codes are conforming and supported, only the second one is known, regional variant used in South Korea (ko-kp for the variant in North Korea is supported)
    ['ku-latn'] = 'ku', -- both codes are conforming and supported, only the second one is known, the Latin script is the default since the 1950's
    ['ku-cyrl'] = 'ku', -- both codes are conforming and supported, only the second one is known, the Cyrillic script is still used
    ['no'] = 'nb', -- both codes are conforming and supported, but the first one is now used only for meaning the second one in MediaWiki
    ['prd'] = 'fa', -- both codes are conforming and supported, only the second one is known; 'prd' is "Parsi-Dari", and means the same as 'fa-af'
    ['tgl'] = 'tl', -- both codes are conforming and supported, only the second one is known (but may have alternate forms written in the Tagal script and not Latin for modern Filipino)

    -- Standard BCP 47 codes for variants that can still not be used in MediaWiki, and must be
    -- replaced for now by legacy codes, valid only in Wikimedia wikis but not conforming to BCP 47.

    ['sr-cyrl'] = 'sr-ec', -- this alternate known code is non-standard and in fact not supported, but has a correct native name
    ['sr-latn'] = 'sr-el', -- same remark
}

local sortedKnownLanguageTags = require('Module:Multilingual description/sort') local dir = require('Module:Dir').select

--[==[
Append one rendered description to the `descriptions` list.

descriptions: list (sequence) receiving the expanded wikitext.
lang:         language code, passed as the first positional argument of the "Ls" template and
              also given to `dir` to pick 'rtl' or 'ltr'.
description:  text of the description; ignored unless it is a string that is non-empty after
              trimming.
update:       optional value forwarded unchanged as the template's "update" argument.
--]==]
local function addDescription(descriptions, lang, description, update)
    if type(description) ~= 'string' then
        return
    end
    -- Descriptions that are empty after trimming produce no output at all.
    if mw.text.trim(description):len() > 0 then
        table.insert(descriptions, mw.getCurrentFrame():expandTemplate{
            title = 'Ls',
            args = {
                lang,
                description,
                dir = dir(lang, 'rtl', 'ltr'),
                classes = 'description',
                update = update
            }
        })
    end
end

--[==[
Append a tracking entry to the `descriptions` list.

descriptions: list (sequence) receiving the entry.
kind:         either 'deprecated', 'conflicting', or 'unsupported'.

NOTE(review): the appended string is empty in this copy of the module and `kind` is unused;
presumably a tracking marker depending on `kind` was lost when the page was extracted. Confirm
against the upstream module before relying on tracking output.
--]==]
local function addTracking(descriptions, kind)
    table.insert(descriptions, '')
end

--[==[
Remove HTML comments and "nowiki" tags from a parameter name or value.

NOTE(review): the original "nowiki" patterns were lost in this copy of the module (only empty
gsub calls remained). The two patterns below are a reconstruction matching opening, self-closing
and closing lowercase "nowiki" tags; confirm against the upstream module.
--]==]
local function stripMarkup(text)
    -- Parentheses truncate gsub's (string, count) result to the string only.
    return (text
        :gsub('<!%-%-.-%-%->', '')
        :gsub('<nowiki%s*/?>', '')
        :gsub('</nowiki%s*>', ''))
end

--[==[
Build the concatenated multilingual description from a table of arguments.

args: table mapping language codes (or comma/space separated lists of codes) to descriptions.
      Only string keys with string values are used; other entries are ignored.

Returns the concatenation of everything appended by addDescription and addTracking: first the
languages listed in sortedKnownLanguageTags (in that order), then the remaining ones in
unspecified order, then the tracking entries.
--]==]
local function _mld(args)
    --[==[
    Shallow copy of arguments (because keys in args cannot be unset if args is hollow, in a parent
    frame outside Lua). DO NOT copy the metatable that exposes only a *read-only* interface with
    accessors to PHP arrays (mw.clone does NOT work)!
    --]==]
    local descriptions, conflicting = {}, false
    for lang, description in pairs(args) do
        if type(lang) == 'string' and type(description) == 'string' then
            --[==[
            MediaWiki trims the names of named arguments and their values, but does not remove
            HTML comments in these names (some Mld contain parameters like "| sk = ...") or
            "nowiki" tags. After removing them, we still need to trim the rest in language codes
            and in descriptions, to detect conflicting descriptions for the same language code.
            --]==]
            description = stripMarkup(description):gsub('^%s*(.-)%s*$', '%1')
            --[==[
            Split multiple language codes (or default) assigned with the same description.
            Valid BCP 47 language codes contain only ASCII letters, digits, hyphens or
            underscores: canonicalize them to lowercase with hyphens replacing underscores
            (other characters are considered separators between language codes).
            --]==]
            for code in stripMarkup(lang):gsub('_', '-'):lower():gmatch('([%-0-9a-z]+)') do
                -- Detect conflicting descriptions, like "|en,default=OK|en=Bad" (whose result
                -- is unpredictable, as all keys are in random order).
                if descriptions[code] ~= nil and descriptions[code] ~= description then
                    conflicting = true
                end
                descriptions[code] = description
            end
        end
    end
    -- From here on, `args` is the plain (writable) copy and `descriptions` collects the output.
    args, descriptions = descriptions, {}

    -- Remap legacy language codes if there's no conflict.
    local remapped = false
    for cur, alt in pairs(remappedLanguages) do
        if args[cur]
            and not (mw.language.isSupportedLanguage(cur) and mw.language.isKnownLanguageTag(cur))
            and (mw.language.isSupportedLanguage(alt) and mw.language.isKnownLanguageTag(alt))
        then
            if args[alt] == nil then -- only if this does not conflict
                args[alt] = args[cur] -- set description for the alternate known language
            else
                -- Signal only in case of conflict. Note that this fires whenever both codes are
                -- present, even when their descriptions are identical.
                remapped = true
                conflicting = true
            end
            args[cur] = nil -- unset the description for the initial language code
        end
    end

    -- First all known languages in order if they have description.
    for _, lang in ipairs(sortedKnownLanguageTags) do
        if args[lang] ~= nil then
            addDescription(descriptions, lang, args[lang], nil)
            args[lang] = nil
        end
    end

    -- Append other unknown languages; unsupported ones are passed with their own code as the
    -- `update` argument and are tracked.
    local unsupported = false
    for lang, description in pairs(args) do
        if mw.language.isSupportedLanguage(lang) then
            addDescription(descriptions, lang, description, nil)
        else
            addDescription(descriptions, lang, description, lang)
            unsupported = true
        end
    end

    if conflicting then
        addTracking(descriptions, 'conflicting')
    end
    if remapped then
        addTracking(descriptions, 'deprecated')
    end
    if unsupported then
        addTracking(descriptions, 'unsupported')
    end
    --mw.logObject(descriptions)
    return table.concat(descriptions)
end

--[==[
Template entry point: renders the multilingual description from the arguments of the parent
frame (the page that transcluded the template invoking this module).

frame: the frame object passed to the module function. If it has no parent frame, or the parent
       has no `args`, an empty argument table is used.

Returns the string built by _mld.
--]==]
function p.mld(frame)
    local args = (frame:getParent() or {}).args or {}
    return _mld(args)
end

--[==[
Attach a self-test to the module's metatable. `quickTests` is stored as a plain metatable field,
so it does not appear among the functions exported by `p`.

quickTests() returns true when _mld(input) equals the expected concatenation, otherwise logs both
strings with mw.log and returns false.
--]==]
setmetatable(p, {quickTests = function()
    local input = {
        [1] = 'One?', -- discarded (no support for language numeric keys)
        unsupported = 'What?', -- unsupported
        en = ' ', -- empty description after trimming (discarded)
        als = 'GSW', -- will be remapped
        ['en-gb '] = 'EN-GB', -- trimming at end
        ['en-ca '] = 'EN-CA',
        [' de'] = 'DE', -- trimming at start
        fr = 'FR',
        [' fr '] = 'FR', -- trimming both ends (description not conflicting)
        rue = 'RUE',
        ru = 'RU',
        ko = 'KO',
        ja = 'JA',
        zh = 'ZH',
        ['he,iw'] = 'HE', -- 'iw' remapped to 'he' (description not conflicting)
        ur = 'UR',
        ar = 'AR',
        ro = 'RO',
        ['be-tarask'] = 'BE-TARASK',
        ['be-x-old'] = 'BE-X-OLD (deprecated)',
        dv = 'DV',
    }
    local expect = {}
    --[==[
    This is the exact order to expect according to native language names, and after discarding
    empty descriptions or unsupported language codes.
    --]==]
    addDescription(expect, 'gsw', 'GSW') --Alemannisch -- remapped
    addDescription(expect, 'en-gb', 'EN-GB') --British English
    addDescription(expect, 'en-ca', 'EN-CA') --Canadian English
    addDescription(expect, 'de', 'DE') --Deutsch
    addDescription(expect, 'fr', 'FR') --français
    addDescription(expect, 'ro', 'RO') --română
    addDescription(expect, 'be-tarask', 'BE-TARASK') --беларуская (тарашкевіца)
    -- addDescription(expect, 'be-x-old', 'BE-X-OLD (deprecated)') --беларуская (тарашкевіца) -- discarded due to conflict
    addDescription(expect, 'rue', 'RUE') --русиньскый
    addDescription(expect, 'ru', 'RU') --русский
    addDescription(expect, 'ko', 'KO') --한국어
    addDescription(expect, 'ja', 'JA') --日本語
    addDescription(expect, 'zh', 'ZH') --中文
    addDescription(expect, 'he', 'HE') --עברית
    addDescription(expect, 'ur', 'UR') --اردو
    addDescription(expect, 'ar', 'AR') --العربية
    addDescription(expect, 'dv', 'DV') --ދިވެހިބަސް
    --[==[
    Note that unknown/unsorted languages may occur here in unpredictable order at end of this
    list, but only if they are "supported" (others will be discarded). So we can only test for
    the presence of one such item.
    --]==]
    addDescription(expect, 'unsupported', 'What?', 'unsupported')
    addTracking(expect, 'conflicting')
    addTracking(expect, 'deprecated')
    addTracking(expect, 'unsupported')
    expect = table.concat(expect)

    local actual = _mld(input)
    if actual ~= expect then
        mw.log('expect:\n' .. expect)
        mw.log('actual:\n' .. actual)
        return false
    end
    return true
end})

--[==[
Type this to run tests in the Lua console:
=getmetatable(p).quickTests() -- should return true
--]==]
return p