WMSPrjInstance/HH-0169_GuoKeHengXiangZhiNengCangChu.git

local unicode = require "luacheck.unicode"
local utils = require "luacheck.utils"
 
local decoder = {}
 
local sbyte = string.byte
local sfind = string.find
local sgsub = string.gsub
local ssub = string.sub
 
-- `LatinChars` and `UnicodeChars` objects represent source strings
-- and provide Unicode-aware access to them with a common interface.
-- Source bytes should not be accessed directly.
-- Provided methods are:
-- `Chars:get_codepoint(index)`: returns codepoint at given index as integer or nil if index is out of range.
-- `Chars:get_substring(from, to)`: returns substring of original bytes corresponding to characters from `from` to `to`.
-- `Chars:get_printable_substring(from. to)`: like get_substring but escapes not printable characters.
-- `Chars:get_length()`: returns total number of characters.
-- `Chars:find(pattern, from)`: `string.find` but `from` is in characters. Return values are still in bytes.
 
-- `LatinChars` is an optimized special case for latin1 strings.
local LatinChars = utils.class()
 
function LatinChars:__init(bytes)
   self._bytes = bytes
end
 
function LatinChars:get_codepoint(index)
   return sbyte(self._bytes, index)
end
 
function LatinChars:get_substring(from, to)
   return ssub(self._bytes, from, to)
end
 
local function hexadecimal_escaper(byte)
   return ("\\x%02X"):format(sbyte(byte))
end
 
function LatinChars:get_printable_substring(from, to)
   return (sgsub(ssub(self._bytes, from, to), "[^\32-\126]", hexadecimal_escaper))
end
 
function LatinChars:get_length()
   return #self._bytes
end
 
function LatinChars:find(pattern, from)
   return sfind(self._bytes, pattern, from)
end
 
-- Decodes `bytes` as UTF8. Returns arrays of codepoints as integers and their byte offsets.
-- Byte offsets have one extra item pointing to one byte past the end of `bytes`.
-- On decoding error returns nothing.
local function get_codepoints_and_byte_offsets(bytes)
   local codepoints = {}
   local byte_offsets = {}
 
   local byte_index = 1
   local codepoint_index = 1
 
   while true do
      byte_offsets[codepoint_index] = byte_index
 
      -- Attempt to decode the next codepoint from UTF8.
      local codepoint = sbyte(bytes, byte_index)
 
      if not codepoint then
         return codepoints, byte_offsets
      end
 
      byte_index = byte_index + 1
 
      if codepoint >= 0x80 then
         -- Not ASCII.
 
         if codepoint < 0xC0 then
            return
         end
 
         local cont = (sbyte(bytes, byte_index) or 0) - 0x80
 
         if cont < 0 or cont >= 0x40 then
            return
         end
 
         byte_index = byte_index + 1
 
         if codepoint < 0xE0 then
            -- Two bytes.
            codepoint = cont + (codepoint - 0xC0) * 0x40
         elseif codepoint < 0xF0 then
            -- Three bytes.
            codepoint = cont + (codepoint - 0xE0) * 0x40
 
            cont = (sbyte(bytes, byte_index) or 0) - 0x80
 
            if cont < 0 or cont >= 0x40 then
               return
            end
 
            byte_index = byte_index + 1
 
            codepoint = cont + codepoint * 0x40
         elseif codepoint < 0xF8 then
            -- Four bytes.
            codepoint = cont + (codepoint - 0xF0) * 0x40
 
            cont = (sbyte(bytes, byte_index) or 0) - 0x80
 
            if cont < 0 or cont >= 0x40 then
               return
            end
 
            byte_index = byte_index + 1
 
            codepoint = cont + codepoint * 0x40
 
            cont = (sbyte(bytes, byte_index) or 0) - 0x80
 
            if cont < 0 or cont >= 0x40 then
               return
            end
 
            byte_index = byte_index + 1
 
            codepoint = cont + codepoint * 0x40
 
            if codepoint > 0x10FFFF then
               return
            end
         else
            return
         end
      end
 
      codepoints[codepoint_index] = codepoint
      codepoint_index = codepoint_index + 1
   end
end
 
-- `UnicodeChars` is the general case for non-latin1 strings.
-- Assumes UTF8, on decoding error falls back to latin1.
local UnicodeChars = utils.class()
 
function UnicodeChars:__init(bytes, codepoints, byte_offsets)
   self._bytes = bytes
   self._codepoints = codepoints
   self._byte_offsets = byte_offsets
end
 
function UnicodeChars:get_codepoint(index)
   return self._codepoints[index]
end
 
function UnicodeChars:get_substring(from, to)
   local byte_offsets = self._byte_offsets
   return ssub(self._bytes, byte_offsets[from], byte_offsets[to + 1] - 1)
end
 
function UnicodeChars:get_printable_substring(from, to)
   -- This is only called on syntax error, it's okay to be slow.
   local parts = {}
 
   for index = from, to do
      local codepoint = self._codepoints[index]
 
      if unicode.is_printable(codepoint) then
         table.insert(parts, self:get_substring(index, index))
      else
         table.insert(parts, (codepoint > 255 and "\\u{%X}" or "\\x%02X"):format(codepoint))
      end
   end
 
   return table.concat(parts)
end
 
function UnicodeChars:get_length()
   return #self._codepoints
end
 
function UnicodeChars:find(pattern, from)
   return sfind(self._bytes, pattern, self._byte_offsets[from])
end
 
function decoder.decode(bytes)
   -- Only use UnicodeChars if necessary. LatinChars isn't much faster but noticeably more memory efficient.
   if sfind(bytes, "[\128-\255]") then
      local codepoints, byte_offsets = get_codepoints_and_byte_offsets(bytes)
 
      if codepoints then
         return UnicodeChars(bytes, codepoints, byte_offsets)
      end
   end
 
   return LatinChars(bytes)
end
 
return decoder