From d6b654d355dfbe8cd0fce3e73eeacbd068419416 Mon Sep 17 00:00:00 2001
From: Rishikesh Vaishnav <rishhvaishnav@gmail.com>
Date: Wed, 15 Dec 2021 18:08:10 +0000
Subject: [PATCH 1/2] refactor(lsp): always crop matching text from end of
 incremental sync range

Previously, when constructing the incremental update sent to the
server, we only attempted to crop matching text from the end of the
old and new `on_lines` ranges for single-line updates.
Now we always attempt to crop from the end, no matter how many lines
were changed. Where possible, this shrinks the increments sent to the
server by excluding unchanged text at the end of the incremental
update range.
---
 runtime/lua/vim/lsp/sync.lua                  | 127 +++++++++---------
 .../plugin/lsp/incremental_sync_spec.lua      |  18 +--
 2 files changed, 69 insertions(+), 76 deletions(-)

diff --git a/runtime/lua/vim/lsp/sync.lua b/runtime/lua/vim/lsp/sync.lua
index d01f45ad8f2a..c745fadf876e 100644
--- a/runtime/lua/vim/lsp/sync.lua
+++ b/runtime/lua/vim/lsp/sync.lua
@@ -93,31 +93,38 @@ end
 -- utf-8 index and either the utf-16, or utf-32 index.
 ---@param line string the line to index into
 ---@param byte integer the byte idx
+---@param start boolean true for start align, false for end align
 ---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
 ---@returns table<string, int> byte_idx and char_idx of first change position
-local function align_end_position(line, byte, offset_encoding)
-  local char
-  -- If on the first byte, or an empty string: the trivial case
-  if byte == 1 or #line == 0 then
-    char = byte
-  -- Called in the case of extending an empty line "" -> "a"
-  elseif byte == #line + 1 then
-    char = compute_line_length(line, offset_encoding) + 1
-  else
+local function align_position(line, byte, start, offset_encoding)
+  if byte ~= 1 and byte <= #line then
     -- Modifying line, find the nearest utf codepoint
     local offset = str_utf_start(line, byte)
+
     -- If the byte does not fall on the start of the character, then
-    -- align to the start of the next character.
+    -- align to the start of the next character if end align, and start
+    -- of this character if start align
     if offset < 0 then
-      byte = byte + str_utf_end(line, byte) + 1
-    end
-    if byte <= #line then
-      char = byte_to_utf(line, byte, offset_encoding)
-    else
-      char = compute_line_length(line, offset_encoding) + 1
+      if start then
+        byte = byte + offset
+      else
+        byte = byte + str_utf_end(line, byte) + 1
+      end
     end
-    -- Extending line, find the nearest utf codepoint for the last valid character
   end
+
+  local char
+
+  -- optimize for first byte case
+  if byte == 1 then
+    char = 1
+  -- Called in the case of extending an empty line "" -> "a"
+  elseif byte == #line + 1 then
+    char = compute_line_length(line, offset_encoding) + 1
+  else
+    char = byte_to_utf(line, byte, offset_encoding)
+  end
+
   return byte, char
 end
 
@@ -158,18 +165,7 @@ local function compute_start_range(prev_lines, curr_lines, firstline, lastline,
   end
 
   -- Convert byte to codepoint if applicable
-  local char_idx
-  local byte_idx
-  if start_byte_idx == 1 or (#prev_line == 0 and start_byte_idx == 1)then
-    byte_idx = start_byte_idx
-    char_idx = 1
-  elseif start_byte_idx == #prev_line + 1 then
-    byte_idx = start_byte_idx
-    char_idx = compute_line_length(prev_line, offset_encoding)  + 1
-  else
-    byte_idx = start_byte_idx + str_utf_start(prev_line, start_byte_idx)
-    char_idx = byte_to_utf(prev_line, byte_idx, offset_encoding)
-  end
+  local byte_idx, char_idx = align_position(prev_line, start_byte_idx, true, offset_encoding)
 
   -- Return the start difference (shared for new and prev lines)
   return { line_idx = firstline, byte_idx = byte_idx, char_idx = char_idx }
@@ -210,51 +206,48 @@ local function compute_end_range(prev_lines, curr_lines, start_range, firstline,
   local prev_line_length = #prev_line
   local curr_line_length = #curr_line
 
-  local byte_offset = 0
+  local prev_line_range, curr_line_range
+  if start_line_idx == prev_line_idx then
+    prev_line_range = prev_line_length - start_range.byte_idx
+  -- start_line_idx < prev_line_idx
+  else
+    prev_line_range = prev_line_length - 1
+  end
+  if start_line_idx == curr_line_idx then
+    curr_line_range = curr_line_length - start_range.byte_idx
+  -- start_line_idx < curr_line_idx
+  else
+    curr_line_range = curr_line_length - 1
+  end
 
-  -- Editing the same line
-  -- If the byte offset is zero, that means there is a difference on the last byte (not newline)
-  if prev_line_idx == curr_line_idx then
-    local max_length
-    if start_line_idx == prev_line_idx then
-      -- Search until beginning of difference
-      max_length = min(prev_line_length - start_range.byte_idx, curr_line_length - start_range.byte_idx) + 1
-    else
-      max_length = min(prev_line_length, curr_line_length) + 1
-    end
-    for idx = 0, max_length do
-      byte_offset = idx
-      if
-        str_byte(prev_line, prev_line_length - byte_offset) ~= str_byte(curr_line, curr_line_length - byte_offset)
-      then
-        break
-      end
+  -- Maximum number of bytes to search backwards for mismatch
+  local max_length = min(prev_line_range, curr_line_range)
+
+  -- Distance back from the final byte of each line to the start of the suffix
+  -- shared by prev_line and curr_line; -1 indicates no shared byte
+  local byte_offset = -1
+
+  -- Iterate from end to beginning of shortest line
+  for idx = 0, max_length do
+    byte_offset = idx
+    if
+      str_byte(prev_line, prev_line_length - byte_offset) ~= str_byte(curr_line, curr_line_length - byte_offset)
+    then
+      -- On a mismatch, step back one offset to the last byte that did match
+      byte_offset = byte_offset - 1
+      break
     end
   end
 
-  -- Iterate from end to beginning of shortest line
-  local prev_end_byte_idx = prev_line_length - byte_offset + 1
+  local prev_end_byte_idx = prev_line_length - byte_offset
 
-  -- Handle case where lines match
-  if prev_end_byte_idx == 0 then
-    prev_end_byte_idx = 1
-  end
-  local prev_byte_idx, prev_char_idx = align_end_position(prev_line, prev_end_byte_idx, offset_encoding)
+  local prev_byte_idx, prev_char_idx = align_position(prev_line, prev_end_byte_idx, false, offset_encoding)
   local prev_end_range = { line_idx = prev_line_idx, byte_idx = prev_byte_idx, char_idx = prev_char_idx }
 
-  local curr_end_range
-  -- Deletion event, new_range cannot be before start
-  if curr_line_idx < start_line_idx then
-    curr_end_range = { line_idx = start_line_idx, byte_idx = 1, char_idx = 1 }
-  else
-    local curr_end_byte_idx = curr_line_length - byte_offset + 1
-    -- Handle case where lines match
-    if curr_end_byte_idx == 0 then
-      curr_end_byte_idx = 1
-    end
-    local curr_byte_idx, curr_char_idx = align_end_position(curr_line, curr_end_byte_idx, offset_encoding)
-    curr_end_range = { line_idx = curr_line_idx, byte_idx = curr_byte_idx, char_idx = curr_char_idx }
-  end
+  local curr_end_byte_idx = curr_line_length - byte_offset
+
+  local curr_byte_idx, curr_char_idx = align_position(curr_line, curr_end_byte_idx, false, offset_encoding)
+  local curr_end_range = { line_idx = curr_line_idx, byte_idx = curr_byte_idx, char_idx = curr_char_idx }
 
   return prev_end_range, curr_end_range
 end
diff --git a/test/functional/plugin/lsp/incremental_sync_spec.lua b/test/functional/plugin/lsp/incremental_sync_spec.lua
index 4e3eddb9607e..e13a5acf3c9a 100644
--- a/test/functional/plugin/lsp/incremental_sync_spec.lua
+++ b/test/functional/plugin/lsp/incremental_sync_spec.lua
@@ -327,12 +327,12 @@ describe('incremental synchronization', function()
               line = 1
             },
             ['end'] = {
-              character = 9,
+              character = 4,
               line = 1
             }
           },
-          rangeLength = 5,
-          text = "_fdsa\nhello world\n1234 asdf"
+          rangeLength = 0,
+          text = "_fdsa\nhello world\n1234"
         },
         -- redo entire deletion
         {
@@ -342,12 +342,12 @@ describe('incremental synchronization', function()
               line = 1
             },
             ['end'] = {
-              character = 9,
+              character = 4,
               line = 3
             }
           },
-          rangeLength = 27,
-          text = ' asdf'
+          rangeLength = 22,
+          text = ''
         },
       }
       local original_lines = {
@@ -460,12 +460,12 @@ describe('incremental synchronization', function()
               line = 0
             },
             ['end'] = {
-              character = 17,
+              character = 12,
               line = 0
             }
           },
-          rangeLength = 6,
-          text = '\ntest3'
+          rangeLength = 1,
+          text = '\n'
         },
       }
       test_edit({"test1 test2", "test3"}, {"J", "u"}, expected_text_changes, 'utf-16', '\n')

From 7a4877f61de0616964b8f939ad132fc256235d93 Mon Sep 17 00:00:00 2001
From: Rishikesh Vaishnav <rishhvaishnav@gmail.com>
Date: Tue, 21 Dec 2021 02:05:36 +0000
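Subject: [PATCH 2/2] Replace UTF helper functions with
 `vim.lsp.util._str_utfindex_enc()`.

Remove the local `byte_to_utf()` and `compute_line_length()` helpers,
along with the `str_utfindex` alias they used, and call
`vim.lsp.util._str_utfindex_enc()` directly at each former call site.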

---
 runtime/lua/vim/lsp/sync.lua | 49 +++---------------------------------
 1 file changed, 4 insertions(+), 45 deletions(-)

diff --git a/runtime/lua/vim/lsp/sync.lua b/runtime/lua/vim/lsp/sync.lua
index c745fadf876e..f725d3272291 100644
--- a/runtime/lua/vim/lsp/sync.lua
+++ b/runtime/lua/vim/lsp/sync.lua
@@ -44,50 +44,9 @@ local M = {}
 -- local string.byte, unclear if this is necessary for JIT compilation
 local str_byte = string.byte
 local min = math.min
-local str_utfindex = vim.str_utfindex
 local str_utf_start = vim.str_utf_start
 local str_utf_end = vim.str_utf_end
 
----@private
--- Given a line, byte idx, and offset_encoding convert to the
--- utf-8, utf-16, or utf-32 index.
----@param line string the line to index into
----@param byte integer the byte idx
----@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
---@returns integer the utf idx for the given encoding
-local function byte_to_utf(line, byte, offset_encoding)
-  -- convert to 0 based indexing for str_utfindex
-  byte = byte - 1
-
-  local utf_idx
-  local _
-  -- Convert the byte range to utf-{8,16,32} and convert 1-based (lua) indexing to 0-based
-  if offset_encoding == 'utf-16' then
-    _, utf_idx = str_utfindex(line, byte)
-  elseif offset_encoding == 'utf-32' then
-    utf_idx, _ = str_utfindex(line, byte)
-  else
-    utf_idx = byte
-  end
-
-  -- convert to 1 based indexing
-  return utf_idx + 1
-end
-
----@private
-local function compute_line_length(line, offset_encoding)
-  local length
-  local _
-  if offset_encoding == 'utf-16' then
-     _, length = str_utfindex(line)
-  elseif offset_encoding == 'utf-32' then
-    length, _ = str_utfindex(line)
-  else
-    length = #line
-  end
-  return length
-end
-
 ---@private
 -- Given a line, byte idx, alignment, and offset_encoding convert to the aligned
 -- utf-8 index and either the utf-16, or utf-32 index.
@@ -120,9 +79,9 @@ local function align_position(line, byte, start, offset_encoding)
     char = 1
   -- Called in the case of extending an empty line "" -> "a"
   elseif byte == #line + 1 then
-    char = compute_line_length(line, offset_encoding) + 1
+    char = vim.lsp.util._str_utfindex_enc(line, nil, offset_encoding) + 1
   else
-    char = byte_to_utf(line, byte, offset_encoding)
+    char = vim.lsp.util._str_utfindex_enc(line, byte - 1, offset_encoding) + 1
   end
 
   return byte, char
@@ -306,7 +265,7 @@ local function compute_range_length(lines, start_range, end_range, offset_encodi
   local start_line = lines[start_range.line_idx]
   local range_length
   if start_line and #start_line > 0 then
-    range_length = compute_line_length(start_line, offset_encoding) - start_range.char_idx + 1 + line_ending_length
+    range_length = vim.lsp.util._str_utfindex_enc(start_line, nil, offset_encoding) - start_range.char_idx + 1 + line_ending_length
   else
     -- Length of newline character
     range_length = line_ending_length
@@ -316,7 +275,7 @@ local function compute_range_length(lines, start_range, end_range, offset_encodi
   for idx = start_range.line_idx + 1, end_range.line_idx - 1 do
     -- Length full line plus newline character
     if #lines[idx] > 0 then
-      range_length = range_length + compute_line_length(lines[idx], offset_encoding) + #line_ending
+      range_length = range_length + vim.lsp.util._str_utfindex_enc(lines[idx], nil, offset_encoding) + #line_ending
     else
       range_length = range_length + line_ending_length
     end