local utils = require "luacheck.utils"

-- The lexer is meant to support the syntax of Lua 5.1, Lua 5.2, Lua 5.3 and
-- LuaJIT (including 64bit and complex cdata literals).

local lexer = {}

-- Standard library functions cached in locals.
local sbyte = string.byte
local schar = string.char
local sreverse = string.reverse
local tconcat = table.concat
local mfloor = math.floor

-- Byte values of the characters the lexer compares against.
-- There is no point in inlining them: fetching a constant costs about as much
-- as fetching a local.
local BYTE_0 = sbyte("0")
local BYTE_9 = sbyte("9")
local BYTE_f = sbyte("f")
local BYTE_F = sbyte("F")
local BYTE_x = sbyte("x")
local BYTE_X = sbyte("X")
local BYTE_i = sbyte("i")
local BYTE_I = sbyte("I")
local BYTE_l = sbyte("l")
local BYTE_L = sbyte("L")
local BYTE_u = sbyte("u")
local BYTE_U = sbyte("U")
local BYTE_e = sbyte("e")
local BYTE_E = sbyte("E")
local BYTE_p = sbyte("p")
local BYTE_P = sbyte("P")
local BYTE_a = sbyte("a")
local BYTE_z = sbyte("z")
local BYTE_A = sbyte("A")
local BYTE_Z = sbyte("Z")
local BYTE_DOT = sbyte(".")
local BYTE_COLON = sbyte(":")
local BYTE_OBRACK = sbyte("[")
local BYTE_CBRACK = sbyte("]")
local BYTE_OBRACE = sbyte("{")
local BYTE_CBRACE = sbyte("}")
local BYTE_QUOTE = sbyte("'")
local BYTE_DQUOTE = sbyte('"')
local BYTE_PLUS = sbyte("+")
local BYTE_DASH = sbyte("-")
local BYTE_LDASH = sbyte("_")
local BYTE_SLASH = sbyte("/")
local BYTE_BSLASH = sbyte("\\")
local BYTE_EQ = sbyte("=")
local BYTE_NE = sbyte("~")
local BYTE_LT = sbyte("<")
local BYTE_GT = sbyte(">")
local BYTE_LF = sbyte("\n")
local BYTE_CR = sbyte("\r")
local BYTE_SPACE = sbyte(" ")
local BYTE_FF = sbyte("\f")
local BYTE_TAB = sbyte("\t")
local BYTE_VTAB = sbyte("\v")

-- Returns the value (0..15) of the hexadecimal digit with byte value `b`,
-- or nil when `b` is not a hexadecimal digit.
local function to_hex(b)
   if BYTE_0 <= b and b <= BYTE_9 then
      return b - BYTE_0
   end

   if BYTE_a <= b and b <= BYTE_f then
      return 10 + b - BYTE_a
   end

   if BYTE_A <= b and b <= BYTE_F then
      return 10 + b - BYTE_A
   end

   return nil
end

-- Returns the value (0..9) of the decimal digit with byte value `b`,
-- or nil when `b` is not a decimal digit.
-- Note that the result for "0" is the number 0, which is truthy in Lua.
local function to_dec(b)
   if b < BYTE_0 or b > BYTE_9 then
      return nil
   end

   return b - BYTE_0
end

-- Encodes `codepoint` as a string of UTF-8 bytes.
local function to_utf(codepoint)
   if codepoint < 0x80 then
      -- ASCII codepoints are encoded as a single byte.
      return schar(codepoint)
   end

   -- Continuation bytes are produced from the least significant bits upwards,
   -- so the bytes are collected in reverse order and flipped at the end.
   local reversed = {}
   local count = 0
   -- Largest value that still fits into the leading byte.
   local max_first_byte = 0x3F

   repeat
      count = count + 1
      reversed[count] = schar(0x80 + codepoint % 0x40)
      codepoint = mfloor(codepoint / 0x40)
      max_first_byte = mfloor(max_first_byte / 2)
   until codepoint <= max_first_byte

   reversed[count + 1] = schar(0xFE - max_first_byte*2 + codepoint)
   return sreverse(tconcat(reversed))
end

-- Checks whether `b` is an ASCII letter or an underscore.
local function is_alpha(b)
   if b == BYTE_LDASH then
      return true
   end

   if BYTE_a <= b and b <= BYTE_z then
      return true
   end

   return BYTE_A <= b and b <= BYTE_Z
end

-- Checks whether `b` is a line feed or a carriage return.
local function is_newline(b)
   return b == BYTE_CR or b == BYTE_LF
end

-- Checks whether `b` is a space, tab, vertical tab or form feed.
local function is_space(b)
   return b == BYTE_SPACE or b == BYTE_TAB or b == BYTE_VTAB or b == BYTE_FF
end

-- Reserved words, built with utils.array_to_set and looked up by identifier text.
local keywords = utils.array_to_set({
   "and", "break", "do", "else", "elseif", "end",
   "false", "for", "function", "goto", "if", "in",
   "local", "nil", "not", "or", "repeat", "return",
   "then", "true", "until", "while"
})

-- Maps the byte following a backslash to the byte that escape stands for.
local simple_escapes = {
   [sbyte("a")] = sbyte("\a"),
   [sbyte("b")] = sbyte("\b"),
   [sbyte("f")] = sbyte("\f"),
   [sbyte("n")] = sbyte("\n"),
   [sbyte("r")] = sbyte("\r"),
   [sbyte("t")] = sbyte("\t"),
   [sbyte("v")] = sbyte("\v"),
   [BYTE_BSLASH] = BYTE_BSLASH,
   [BYTE_QUOTE] = BYTE_QUOTE,
   [BYTE_DQUOTE] = BYTE_DQUOTE
}

-- Advances state.offset by one and returns the codepoint at the new offset.
local function next_byte(state)
   state.offset = state.offset + 1
   return state.src:get_codepoint(state.offset)
end

-- Skipping helpers.
-- Take the current character, skip something, return next character.
-- Skips a line break starting with `newline` (the current character).
-- Records the length of the finished line and the offset of the next one in
-- state.line_lengths / state.line_offsets and increments state.line.
local function skip_newline(state, newline)
   local newline_offset = state.offset
   local finished_line = state.line
   local b = next_byte(state)

   -- A different newline character right after the first one is skipped as
   -- part of the same line break ("\r\n" or "\n\r").
   if is_newline(b) and b ~= newline then
      b = next_byte(state)
   end

   local line_offsets = state.line_offsets
   state.line_lengths[finished_line] = newline_offset - line_offsets[finished_line]
   state.line = finished_line + 1
   line_offsets[finished_line + 1] = state.offset
   return b
end

-- Advances until the current character is a newline or the source ends.
local function skip_to_newline(state, b)
   while b and not is_newline(b) do
      b = next_byte(state)
   end

   return b
end

-- Skips any run of whitespace and line breaks, keeping line bookkeeping
-- up to date through skip_newline.
local function skip_space(state, b)
   while true do
      if is_newline(b) then
         b = skip_newline(state, b)
      elseif is_space(b) then
         b = next_byte(state)
      else
         return b
      end
   end
end

-- Skips "[=*" or "]=*". Returns next character and number of "="s.
local function skip_long_bracket(state)
   local level = 0
   local b = next_byte(state)

   while b == BYTE_EQ do
      level = level + 1
      b = next_byte(state)
   end

   return b, level
end

-- Token handlers.

-- Lexes the body of a long string or long comment.
-- Called after the opening "[=*" has been skipped.
-- Takes number of "=" in the opening bracket and token type(comment or string).
-- Returns the token type and the contents with line breaks normalized to "\n",
-- or nil and an error message if the source ends before the closing bracket.
local function lex_long_string(state, opening_long_bracket, token)
   local b = next_byte(state)

   -- A line break directly after the opening bracket is skipped and does not
   -- become part of the value.
   if is_newline(b) then
      b = skip_newline(state, b)
   end

   local lines = {}
   local line_start = state.offset

   while true do
      if b == nil then
         if token == "string" then
            return nil, "unfinished long string"
         end

         return nil, "unfinished long comment"
      elseif is_newline(b) then
         -- Store the line that has just ended.
         lines[#lines + 1] = state.src:get_substring(line_start, state.offset - 1)
         b = skip_newline(state, b)
         line_start = state.offset
      elseif b == BYTE_CBRACK then
         local closing_long_bracket
         b, closing_long_bracket = skip_long_bracket(state)

         if b == BYTE_CBRACK and closing_long_bracket == opening_long_bracket then
            break
         end
         -- Otherwise `b` is examined again by the next iteration.
      else
         b = next_byte(state)
      end
   end

   -- Offset points at the final "]"; the last line ends right before the
   -- closing bracket, which is opening_long_bracket + 2 characters long.
   lines[#lines + 1] = state.src:get_substring(line_start, state.offset - opening_long_bracket - 2)
   state.offset = state.offset + 1
   return token, tconcat(lines, "\n")
end

-- Lexes a string delimited by `quote` (the current character).
-- Returns "string" and the value with escape sequences resolved.
-- On error returns nil, an error message and, for invalid escape sequences,
-- a negative offset locating the backslash relative to state.offset.
local function lex_short_string(state, quote)
   local b = next_byte(state)
   -- The buffer is only created once the first escape sequence is seen.
   local chunks
   local chunk_start = state.offset

   while b ~= quote do
      if b == nil or is_newline(b) then
         return nil, "unfinished string"
      elseif b ~= BYTE_BSLASH then
         b = next_byte(state)
      else
         -- Escape sequence: flush the literal text collected so far.
         chunks = chunks or {}

         if chunk_start ~= state.offset then
            chunks[#chunks + 1] = state.src:get_substring(chunk_start, state.offset - 1)
         end

         b = next_byte(state)

         -- String the escape sequence evaluates to; stays nil for "\z".
         local s
         local escape_byte = simple_escapes[b]

         if escape_byte then
            b = next_byte(state)
            s = schar(escape_byte)
         elseif is_newline(b) then
            b = skip_newline(state, b)
            s = "\n"
         elseif b == BYTE_x then
            -- Hexadecimal escape: exactly two hexadecimal digits after "x".
            b = next_byte(state)
            local high = b and to_hex(b)

            if not high then
               return nil, "invalid hexadecimal escape sequence", -2
            end

            b = next_byte(state)
            local low = b and to_hex(b)

            if not low then
               return nil, "invalid hexadecimal escape sequence", -3
            end

            b = next_byte(state)
            s = schar(high*16 + low)
         elseif b == BYTE_u then
            -- UTF-8 escape: "u{" followed by hexadecimal digits and "}".
            b = next_byte(state)

            if b ~= BYTE_OBRACE then
               return nil, "invalid UTF-8 escape sequence", -2
            end

            b = next_byte(state)
            -- At least one digit is required.
            local codepoint = b and to_hex(b)

            if not codepoint then
               return nil, "invalid UTF-8 escape sequence", -3
            end

            -- Number of digits consumed after the first one.
            local extra_digits = 0
            b = next_byte(state)
            local digit = b and to_hex(b)

            while digit do
               extra_digits = extra_digits + 1
               codepoint = codepoint*16 + digit

               if codepoint > 0x10FFFF then
                  -- UTF-8 value too large.
                  return nil, "invalid UTF-8 escape sequence", -extra_digits - 3
               end

               b = next_byte(state)
               digit = b and to_hex(b)
            end

            if b ~= BYTE_CBRACE then
               return nil, "invalid UTF-8 escape sequence", -extra_digits - 4
            end

            b = next_byte(state)
            s = to_utf(codepoint)
         elseif b == BYTE_z then
            -- "\z" drops the following span of whitespace and line breaks.
            b = skip_space(state, next_byte(state))
         else
            -- Anything else has to be a decimal escape of one to three digits.
            local value = b and to_dec(b)

            if not value then
               return nil, "invalid escape sequence", -1
            end

            b = next_byte(state)
            local digit_count = 1

            while digit_count < 3 do
               local digit = b and to_dec(b)

               if not digit then
                  break
               end

               value = 10*value + digit
               digit_count = digit_count + 1

               -- Only reachable with three digits; offset is still at the
               -- third digit, so -3 points back at the backslash.
               if value > 255 then
                  return nil, "invalid decimal escape sequence", -3
               end

               b = next_byte(state)
            end

            s = schar(value)
         end

         if s then
            chunks[#chunks + 1] = s
         end

         -- The next literal chunk begins right after the escape sequence.
         chunk_start = state.offset
      end
   end

   -- Offset now points at the closing quote.
   local string_value

   if not chunks then
      -- No escape sequences: the value is a plain substring of the source.
      string_value = state.src:get_substring(chunk_start, state.offset - 1)
   else
      if chunk_start ~= state.offset then
         chunks[#chunks + 1] = state.src:get_substring(chunk_start, state.offset - 1)
      end

      string_value = tconcat(chunks)
   end

   -- Step over the closing quote.
   state.offset = state.offset + 1
   return "string", string_value
end

-- Payload for a number is simply a substring.
-- Luacheck is supposed to be forward-compatible with Lua 5.3 and LuaJIT syntax, so
-- parsing it into an actual number may be problematic.
-- It is not needed currently anyway as Luacheck does not do static evaluation yet.
-- Lexes a numeric literal starting at the current character `b`.
-- Accepts decimal and hexadecimal integers and floats, plus the "i", "ll"
-- and "ull" style suffixes in either case.
-- Returns "number" and the literal's source text, or nil and "malformed number".
local function lex_number(state, b)
   local start = state.offset
   -- Decimal numbers use "e"/"E" exponents and decimal digits by default.
   local exp_lower, exp_upper = BYTE_e, BYTE_E
   local is_digit = to_dec
   local has_digits = false
   local is_float = false

   if b == BYTE_0 then
      b = next_byte(state)

      if b == BYTE_x or b == BYTE_X then
         -- Hexadecimal numbers use "p"/"P" exponents and hexadecimal digits.
         exp_lower, exp_upper = BYTE_p, BYTE_P
         is_digit = to_hex
         b = next_byte(state)
      else
         -- The leading zero itself counts as a digit.
         has_digits = true
      end
   end

   -- Integer part.
   while b ~= nil and is_digit(b) do
      has_digits = true
      b = next_byte(state)
   end

   if b == BYTE_DOT then
      -- Fractional part.
      is_float = true
      b = next_byte(state)

      while b ~= nil and is_digit(b) do
         has_digits = true
         b = next_byte(state)
      end
   end

   if b == exp_lower or b == exp_upper then
      -- Exponent part: optional sign followed by one or more decimal digits.
      is_float = true
      b = next_byte(state)

      if b == BYTE_PLUS or b == BYTE_DASH then
         b = next_byte(state)
      end

      if b == nil or not to_dec(b) then
         return nil, "malformed number"
      end

      repeat
         b = next_byte(state)
      until b == nil or not to_dec(b)
   end

   if not has_digits then
      return nil, "malformed number"
   end

   -- Length of a recognized cdata literal suffix, if any.
   local suffix_length = 0

   if b == BYTE_i or b == BYTE_I then
      -- Complex literal.
      suffix_length = 1
   elseif not is_float then
      -- uint64_t and int64_t literals can not be fractional.
      if b == BYTE_u or b == BYTE_U then
         -- "ull" makes a uint64_t literal; anything shorter is not a suffix.
         local second = state.src:get_codepoint(state.offset + 1)

         if second == BYTE_l or second == BYTE_L then
            local third = state.src:get_codepoint(state.offset + 2)

            if third == BYTE_l or third == BYTE_L then
               suffix_length = 3
            end
         end
      elseif b == BYTE_l or b == BYTE_L then
         -- "llu" makes a uint64_t literal, plain "ll" an int64_t literal.
         local second = state.src:get_codepoint(state.offset + 1)

         if second == BYTE_l or second == BYTE_L then
            local third = state.src:get_codepoint(state.offset + 2)

            if third == BYTE_u or third == BYTE_U then
               suffix_length = 3
            else
               suffix_length = 2
            end
         end
      end
   end

   if suffix_length > 0 then
      state.offset = state.offset + suffix_length
   end

   return "number", state.src:get_substring(start, state.offset - 1)
end

-- Lexes an identifier or keyword starting at the current character.
-- Returns the keyword itself as the token, or "name" and the identifier.
local function lex_ident(state)
   local start = state.offset
   local b

   repeat
      b = next_byte(state)
   until b == nil or not (is_alpha(b) or to_dec(b))

   local ident = state.src:get_substring(start, state.offset - 1)

   if not keywords[ident] then
      return "name", ident
   end

   return ident
end

-- Lexes "-", a short comment or a long comment.
local function lex_dash(state)
   local b = next_byte(state)

   if b ~= BYTE_DASH then
      -- A single dash is the minus operator.
      return "-"
   end

   -- Two dashes start a comment; its text begins after them.
   b = next_byte(state)
   local comment_start = state.offset

   if b == BYTE_OBRACK then
      local long_bracket
      b, long_bracket = skip_long_bracket(state)

      if b == BYTE_OBRACK then
         return lex_long_string(state, long_bracket, "long_comment")
      end
      -- Not a complete opening long bracket: treated as a short comment.
   end

   skip_to_newline(state, b)
   local comment_value = state.src:get_substring(comment_start, state.offset - 1)
   return "short_comment", comment_value
end

-- Lexes "[" or a long string.
local function lex_bracket(state)
   local b, long_bracket = skip_long_bracket(state)

   if b == BYTE_OBRACK then
      return lex_long_string(state, long_bracket, "string")
   end

   if long_bracket ~= 0 then
      -- "[" followed by "="s but no second "[".
      return nil, "invalid long string delimiter"
   end

   return "["
end

-- Lexes "=" or "==".
local function lex_eq(state)
   if next_byte(state) ~= BYTE_EQ then
      return "="
   end

   state.offset = state.offset + 1
   return "=="
end

-- Lexes "<", "<=" or "<<".
local function lex_lt(state)
   local b = next_byte(state)

   if b ~= BYTE_EQ and b ~= BYTE_LT then
      return "<"
   end

   state.offset = state.offset + 1
   return b == BYTE_EQ and "<=" or "<<"
end

-- Lexes ">", ">=" or ">>".
local function lex_gt(state)
   local b = next_byte(state)

   if b ~= BYTE_EQ and b ~= BYTE_GT then
      return ">"
   end

   state.offset = state.offset + 1
   return b == BYTE_EQ and ">=" or ">>"
end

-- Lexes "/" or "//".
local function lex_div(state)
   if next_byte(state) ~= BYTE_SLASH then
      return "/"
   end

   state.offset = state.offset + 1
   return "//"
end

-- Lexes "~" or "~=".
local function lex_ne(state)
   if next_byte(state) ~= BYTE_EQ then
      return "~"
   end

   state.offset = state.offset + 1
   return "~="
end

-- Lexes ":" or "::".
local function lex_colon(state)
   if next_byte(state) ~= BYTE_COLON then
      return ":"
   end

   state.offset = state.offset + 1
   return "::"
end

-- Lexes ".", "..", "..." or a number that starts with a dot.
local function lex_dot(state)
   local b = next_byte(state)

   if b == BYTE_DOT then
      if next_byte(state) ~= BYTE_DOT then
         return ".."
      end

      state.offset = state.offset + 1
      return "...", "..."
   end

   if b and to_dec(b) then
      -- Step back so that re-reading lands on the dot, then lex it as a number.
      state.offset = state.offset - 2
      local dot = next_byte(state)
      return lex_number(state, dot)
   end

   return "."
end

-- Fallback handler: consumes one character and returns it as the token.
-- Codepoints above 255 are clamped to 255 before conversion to a string.
local function lex_any(state, b)
   state.offset = state.offset + 1
   return schar(b > 255 and 255 or b)
end

-- Maps first bytes of tokens to functions that handle them.
-- Each handler takes the first byte as an argument.
-- Each handler stops at the character after the token and returns the token and,
-- optionally, a value associated with the token.
-- On error a handler returns nil, an error message and, optionally, the start
-- of the reported location as a negative offset.
local byte_handlers = {}

byte_handlers[BYTE_DOT] = lex_dot
byte_handlers[BYTE_COLON] = lex_colon
byte_handlers[BYTE_OBRACK] = lex_bracket
byte_handlers[BYTE_QUOTE] = lex_short_string
byte_handlers[BYTE_DQUOTE] = lex_short_string
byte_handlers[BYTE_DASH] = lex_dash
byte_handlers[BYTE_SLASH] = lex_div
byte_handlers[BYTE_EQ] = lex_eq
byte_handlers[BYTE_NE] = lex_ne
byte_handlers[BYTE_LT] = lex_lt
byte_handlers[BYTE_GT] = lex_gt
byte_handlers[BYTE_LDASH] = lex_ident

-- Digits start numbers, letters start identifiers.
for _, range in ipairs({
   {BYTE_0, BYTE_9, lex_number},
   {BYTE_a, BYTE_z, lex_ident},
   {BYTE_A, BYTE_Z, lex_ident}
}) do
   for first_byte = range[1], range[2] do
      byte_handlers[first_byte] = range[3]
   end
end

-- Creates and returns lexer state for source.
-- `line_offsets` and `line_lengths` are optional tables to fill in; fresh
-- tables are used when they are omitted.
function lexer.new_state(src, line_offsets, line_lengths)
   line_offsets = line_offsets or {}
   line_offsets[1] = 1

   local state = {
      src = src,
      line = 1,
      line_offsets = line_offsets,
      line_lengths = line_lengths or {},
      offset = 1
   }

   local has_shebang = src:get_length() >= 2 and src:get_substring(1, 2) == "#!"

   if has_shebang then
      -- Skip shebang line: start from its second character and run to the
      -- line break.
      state.offset = 2
      skip_to_newline(state, next_byte(state))
   end

   return state
end

-- Returns the printable source text from `offset` to `end_offset` wrapped in
-- single quotes. When the length of `line` is already known, the range is cut
-- off at the end of that line.
function lexer.get_quoted_substring_or_line(state, line, offset, end_offset)
   local line_length = state.line_lengths[line]
   local line_end_offset = line_length and (state.line_offsets[line] + line_length - 1)

   if line_end_offset and line_end_offset < end_offset then
      end_offset = line_end_offset
   end

   return "'" .. state.src:get_printable_substring(offset, end_offset) .. "'"
end

-- Looks for next token starting from state.line, state.offset.
-- Returns next token, its value and its location (line, offset).
-- Sets state.line, state.offset to token end location + 1.
-- Fills state.line_offsets and state.line_lengths.
-- On error returns nil, error message, error location (line, offset), error end offset.
function lexer.next_token(state)
   local current = state.src:get_codepoint(state.offset)
   local b = skip_space(state, current)

   -- Remember where the token starts.
   local token_line = state.line
   local token_offset = state.offset

   if not b then
      -- End of source. EOF token has length 1.
      state.line_lengths[token_line] = token_offset - state.line_offsets[token_line]
      state.offset = token_offset + 1
      return "eof", nil, token_line, token_offset
   end

   local handler = byte_handlers[b] or lex_any
   local token, token_value, relative_error_offset = handler(state, b)

   if relative_error_offset then
      -- The handler reported an error located relative to the current offset;
      -- token_value holds the error message.
      local error_line = state.line
      local error_offset = state.offset + relative_error_offset
      local error_end_offset = math.min(state.offset, state.src:get_length())
      local quoted = lexer.get_quoted_substring_or_line(state, error_line, error_offset, error_end_offset)
      return nil, token_value .. " " .. quoted, error_line, error_offset, error_end_offset
   end

   -- Errors without a relative offset are reported as a single character at
   -- the token start; successful tokens get false as the fifth value.
   local error_end_offset = false

   if not token then
      error_end_offset = token_offset
   end

   return token, token_value, token_line, token_offset, error_end_offset
end

return lexer