From 166c4d54efc9e8c10502db13c094d7de3f245ce5 Mon Sep 17 00:00:00 2001
From: Rishikesh Vaishnav <rishhvaishnav@gmail.com>
Date: Wed, 15 Dec 2021 21:19:43 +0000
Subject: [PATCH] fix(lsp): incremental sync UTF fixes (#16624)

Aligning end position:
- fix check for preexisting UTF-8 alignment:
  check `vim.str_utf_start() == 0` instead of `vim.str_utf_end() == 0`
- fix setting of byte index when not already aligned:
  set to beginning of next codepoint rather than end of this one
- set char index after aligning byte index, removing unnecessary
  adjustment of byte index when already aligned

Aligning start position:
- fix setting of char index:
  use aligned byte index rather than original in `byte_to_utf()`

Add tests to cover these fixes as well as some other previously untested paths.
---
 runtime/lua/vim/lsp/sync.lua                  |  15 +-
 .../plugin/lsp/incremental_sync_spec.lua      | 269 ++++++++++++++++++
 2 files changed, 277 insertions(+), 7 deletions(-)

diff --git a/runtime/lua/vim/lsp/sync.lua b/runtime/lua/vim/lsp/sync.lua
index 5df2a4d144d2..d01f45ad8f2a 100644
--- a/runtime/lua/vim/lsp/sync.lua
+++ b/runtime/lua/vim/lsp/sync.lua
@@ -105,15 +105,16 @@ local function align_end_position(line, byte, offset_encoding)
     char = compute_line_length(line, offset_encoding) + 1
   else
     -- Modifying line, find the nearest utf codepoint
-    local offset = str_utf_end(line, byte)
+    local offset = str_utf_start(line, byte)
     -- If the byte does not fall on the start of the character, then
     -- align to the start of the next character.
-    if offset > 0 then
-      char = byte_to_utf(line, byte, offset_encoding) + 1
-      byte = byte + offset
-    else
+    if offset < 0 then
+      byte = byte + str_utf_end(line, byte) + 1
+    end
+    if byte <= #line then
       char = byte_to_utf(line, byte, offset_encoding)
-      byte = byte + offset
+    else
+      char = compute_line_length(line, offset_encoding) + 1
     end
     -- Extending line, find the nearest utf codepoint for the last valid character
   end
@@ -167,7 +168,7 @@ local function compute_start_range(prev_lines, curr_lines, firstline, lastline,
     char_idx = compute_line_length(prev_line, offset_encoding)  + 1
   else
     byte_idx = start_byte_idx + str_utf_start(prev_line, start_byte_idx)
-    char_idx = byte_to_utf(prev_line, start_byte_idx, offset_encoding)
+    char_idx = byte_to_utf(prev_line, byte_idx, offset_encoding)
   end
 
   -- Return the start difference (shared for new and prev lines)
diff --git a/test/functional/plugin/lsp/incremental_sync_spec.lua b/test/functional/plugin/lsp/incremental_sync_spec.lua
index 5dd34e766528..4e3eddb9607e 100644
--- a/test/functional/plugin/lsp/incremental_sync_spec.lua
+++ b/test/functional/plugin/lsp/incremental_sync_spec.lua
@@ -164,6 +164,201 @@ describe('incremental synchronization', function()
       }
       test_edit({"a"}, {"rb"}, expected_text_changes, 'utf-16', '\n')
     end)
+    it('deleting a line', function()
+      local expected_text_changes = {
+        {
+          range = {
+            ['start'] = {
+              character = 0,
+              line = 0
+            },
+            ['end'] = {
+              character = 0,
+              line = 1
+            }
+          },
+          rangeLength = 12,
+          text = ''
+        }
+      }
+      test_edit({"hello world"}, {"dd"}, expected_text_changes, 'utf-16', '\n')
+    end)
+    it('deleting an empty line', function()
+      local expected_text_changes = {
+        {
+          range = {
+            ['start'] = {
+              character = 0,
+              line = 1
+            },
+            ['end'] = {
+              character = 0,
+              line = 2
+            }
+          },
+          rangeLength = 1,
+          text = ''
+        }
+      }
+      test_edit({"hello world", ""}, {"jdd"}, expected_text_changes, 'utf-16', '\n')
+    end)
+    it('adding a line', function()
+      local expected_text_changes = {
+        {
+          range = {
+            ['start'] = {
+              character = 0,
+              line = 1
+            },
+            ['end'] = {
+              character = 0,
+              line = 1
+            }
+          },
+          rangeLength = 0,
+          text = 'hello world\n'
+        }
+      }
+      test_edit({"hello world"}, {"yyp"}, expected_text_changes, 'utf-16', '\n')
+    end)
+    it('adding an empty line', function()
+      local expected_text_changes = {
+        {
+          range = {
+            ['start'] = {
+              character = 0,
+              line = 1
+            },
+            ['end'] = {
+              character = 0,
+              line = 1
+            }
+          },
+          rangeLength = 0,
+          text = '\n'
+        }
+      }
+      test_edit({"hello world"}, {"o"}, expected_text_changes, 'utf-16', '\n')
+    end)
+  end)
+  describe('multi line edit', function()
+    it('deletion and insertion', function()
+      local expected_text_changes = {
+        -- delete "_fdsa" from end of line 1
+        {
+          range = {
+            ['start'] = {
+              character = 4,
+              line = 1
+            },
+            ['end'] = {
+              character = 9,
+              line = 1
+            }
+          },
+          rangeLength = 5,
+          text = ''
+        },
+        -- delete "hello world\n" from line 2
+        {
+          range = {
+            ['start'] = {
+              character = 0,
+              line = 2
+            },
+            ['end'] = {
+              character = 0,
+              line = 3
+            }
+          },
+          rangeLength = 12,
+          text = ''
+        },
+        -- delete "1234" from beginning of line 2
+        {
+          range = {
+            ['start'] = {
+              character = 0,
+              line = 2
+            },
+            ['end'] = {
+              character = 4,
+              line = 2
+            }
+          },
+          rangeLength = 4,
+          text = ''
+        },
+        -- add " asdf" to end of line 1
+        {
+          range = {
+            ['start'] = {
+              character = 4,
+              line = 1
+            },
+            ['end'] = {
+              character = 4,
+              line = 1
+            }
+          },
+          rangeLength = 0,
+          text = ' asdf'
+        },
+        -- delete " asdf\n" from line 2
+        {
+          range = {
+            ['start'] = {
+              character = 0,
+              line = 2
+            },
+            ['end'] = {
+              character = 0,
+              line = 3
+            }
+          },
+          rangeLength = 6,
+          text = ''
+        },
+        -- undo entire deletion
+        {
+          range = {
+            ['start'] = {
+              character = 4,
+              line = 1
+            },
+            ['end'] = {
+              character = 9,
+              line = 1
+            }
+          },
+          rangeLength = 5,
+          text = "_fdsa\nhello world\n1234 asdf"
+        },
+        -- redo entire deletion
+        {
+          range = {
+            ['start'] = {
+              character = 4,
+              line = 1
+            },
+            ['end'] = {
+              character = 9,
+              line = 3
+            }
+          },
+          rangeLength = 27,
+          text = ' asdf'
+        },
+      }
+      local original_lines = {
+        "\\begin{document}",
+        "test_fdsa",
+        "hello world",
+        "1234 asdf",
+        "\\end{document}"
+      }
+      test_edit(original_lines, {"jf_vejjbhhdu<C-R>"}, expected_text_changes, 'utf-16', '\n')
+    end)
   end)
 
   describe('multi-operation edits', function()
@@ -297,6 +492,80 @@ describe('incremental synchronization', function()
       }
       test_edit({"🔥"}, {"x"}, expected_text_changes, 'utf-16', '\n')
     end)
+    it('replacing a multibyte character with matching prefix', function()
+      local expected_text_changes = {
+        {
+          range = {
+            ['start'] = {
+              character = 0,
+              line = 1
+            },
+            ['end'] = {
+              character = 1,
+              line = 1
+            }
+          },
+          rangeLength = 1,
+          text = '⟩'
+        }
+      }
+      -- ⟨ is e29fa8, ⟩ is e29fa9
+      local original_lines = {
+        "\\begin{document}",
+        "⟨",
+        "\\end{document}",
+      }
+      test_edit(original_lines, {"jr⟩"}, expected_text_changes, 'utf-16', '\n')
+    end)
+    it('replacing a multibyte character with matching suffix', function()
+      local expected_text_changes = {
+        {
+          range = {
+            ['start'] = {
+              character = 0,
+              line = 1
+            },
+            ['end'] = {
+              character = 1,
+              line = 1
+            }
+          },
+          rangeLength = 1,
+          text = 'ḟ'
+        }
+      }
+      -- ฟ is e0b89f, ḟ is e1b89f
+      local original_lines = {
+        "\\begin{document}",
+        "ฟ",
+        "\\end{document}",
+      }
+      test_edit(original_lines, {"jrḟ"}, expected_text_changes, 'utf-16', '\n')
+    end)
+    it('inserting before a multibyte character', function()
+      local expected_text_changes = {
+        {
+          range = {
+            ['start'] = {
+              character = 0,
+              line = 1
+            },
+            ['end'] = {
+              character = 0,
+              line = 1
+            }
+          },
+          rangeLength = 0,
+          text = ' '
+        }
+      }
+      local original_lines = {
+        "\\begin{document}",
+        "→",
+        "\\end{document}",
+      }
+      test_edit(original_lines, {"ji "}, expected_text_changes, 'utf-16', '\n')
+    end)
     it('deleting a multibyte character from a long line', function()
       local expected_text_changes = {
         {