  1. " LLM-based text completion using llama.cpp
  2. "
  3. " requires:
  4. "
  5. " - neovim or vim
  6. " - curl
  7. " - llama.cpp server instance
  8. " - FIM-compatible model
  9. "
  10. " sample config:
  11. "
  12. " - Tab - accept the current suggestion
  13. " - Shift+Tab - accept just the first line of the suggestion
  14. " - Ctrl+F - toggle FIM completion manually
  15. "
  16. " make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
  17. "
  18. " start the llama.cpp server with a FIM-compatible model. for example:
  19. "
  20. " $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
  21. "
  22. " --batch-size [512, model max context]
  23. "
  24. " adjust the batch size to control how much of the provided local context will be used during the inference
  25. " lower values will use smaller part of the context around the cursor, which will result in faster processing
  26. "
  27. " --ubatch-size [64, 2048]
  28. "
  29. " chunks the batch into smaller chunks for faster processing
  30. " depends on the specific hardware. use llama-bench to profile and determine the best size
  31. "
  32. " --cache-reuse (ge:llama_config.n_predict, 1024]
  33. "
  34. " this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict
  35. " using non-zero value enables context reuse on the server side which dramatically improves the performance at
  36. " large contexts. a value of 256 should be good for all cases
  37. "
  38. " run this once to initialise llama.vim:
  39. "
  40. " :call llama#init()
  41. "
  42. " more info: https://github.com/ggerganov/llama.cpp/pull/9787
  43. "
  44. " colors (adjust to your liking)
  45. highlight llama_hl_hint guifg=#ff772f ctermfg=202
  46. highlight llama_hl_info guifg=#77ff2f ctermfg=119
  47. " general parameters:
  48. "
  49. " endpoint: llama.cpp server endpoint
  50. " n_prefix: number of lines before the cursor location to include in the local prefix
  51. " n_suffix: number of lines after the cursor location to include in the local suffix
  52. " n_predict: max number of tokens to predict
  53. " t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported)
  54. " t_max_predict_ms: max alloted time for the prediction
  55. " show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
  56. " auto_fim: trigger FIM completion automatically on cursor movement
  57. " max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
  58. "
  59. " ring buffer of chunks, accumulated with time upon:
  60. "
  61. " - completion request
  62. " - yank
  63. " - entering a buffer
  64. " - leaving a buffer
  65. " - writing a file
  66. "
  67. " parameters for the ring-buffer with extra context:
  68. "
  69. " ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable)
  70. " ring_chunk_size: max size of the chunks (in number of lines)
  71. " note: adjust these numbers so that you don't overrun your context
  72. " at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context
  73. " ring_scope: the range around the cursor position (in number of lines) for gathering chunks after FIM
  74. " ring_update_ms: how often to process queued chunks in normal mode
  75. "
  76. let s:default_config = {
  77. \ 'endpoint': 'http://127.0.0.1:8012/infill',
  78. \ 'n_prefix': 256,
  79. \ 'n_suffix': 64,
  80. \ 'n_predict': 128,
  81. \ 't_max_prompt_ms': 500,
  82. \ 't_max_predict_ms': 3000,
  83. \ 'show_info': 2,
  84. \ 'auto_fim': v:true,
  85. \ 'max_line_suffix': 8,
  86. \ 'ring_n_chunks': 64,
  87. \ 'ring_chunk_size': 64,
  88. \ 'ring_scope': 1024,
  89. \ 'ring_update_ms': 1000,
  90. \ }
  91. let g:llama_config = get(g:, 'llama_config', s:default_config)
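
" note: a user-provided g:llama_config replaces s:default_config wholesale (it is
" read with get() above), so an override in your vimrc should define every key.
" a minimal sketch, with illustrative values tuned down for slower machines:
"
"   let g:llama_config = {
"       \ 'endpoint':         'http://127.0.0.1:8012/infill',
"       \ 'n_prefix':         128,
"       \ 'n_suffix':         32,
"       \ 'n_predict':        64,
"       \ 't_max_prompt_ms':  500,
"       \ 't_max_predict_ms': 1000,
"       \ 'show_info':        1,
"       \ 'auto_fim':         v:false,
"       \ 'max_line_suffix':  8,
"       \ 'ring_n_chunks':    16,
"       \ 'ring_chunk_size':  32,
"       \ 'ring_scope':       512,
"       \ 'ring_update_ms':   1000,
"       \ }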

" count the extra display cells needed for the leading tabs of a:str
" (each tab occupies &tabstop cells, i.e. &tabstop - 1 more than a single character)
" used for 'text_padding_left' when rendering ghost text below
function! s:get_indent(str)
    let l:count = 0
    for i in range(len(a:str))
        if a:str[i] == "\t"
            let l:count += &tabstop - 1
        else
            break
        endif
    endfor
    return l:count
endfunction
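
" example: with &tabstop = 4, a line starting with two tab characters yields
" s:get_indent(...) == 2 * (4 - 1) == 6 extra cells of left padding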

function! s:rand(i0, i1) abort
    return a:i0 + rand() % (a:i1 - a:i0 + 1)
endfunction

function! llama#init()
    if !executable('curl')
        echohl WarningMsg
        echo 'llama.vim requires the "curl" command to be available'
        echohl None
        return
    endif

    let s:pos_x = 0 " cursor position upon start of completion
    let s:pos_y = 0

    let s:line_cur = ''
    let s:line_cur_prefix = ''
    let s:line_cur_suffix = ''

    let s:ring_chunks = [] " current set of chunks used as extra context
    let s:ring_queued = [] " chunks that are queued to be sent for processing
    let s:ring_n_evict = 0

    let s:hint_shown = v:false
    let s:pos_y_pick = -9999 " last y where we picked a chunk
    let s:pos_dx = 0
    let s:content = []
    let s:can_accept = v:false

    let s:timer_fim = -1
    let s:t_fim_start = reltime() " used to measure total FIM time
    let s:t_last_move = reltime() " last time the cursor moved

    let s:current_job = v:null

    let s:ghost_text_nvim = exists('*nvim_buf_get_mark')
    let s:ghost_text_vim = has('textprop')

    if s:ghost_text_vim
        let s:hlgroup_hint = 'llama_hl_hint'
        let s:hlgroup_info = 'llama_hl_info'

        if empty(prop_type_get(s:hlgroup_hint))
            call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint})
        endif
        if empty(prop_type_get(s:hlgroup_info))
            call prop_type_add(s:hlgroup_info, {'highlight': s:hlgroup_info})
        endif
    endif

    augroup llama
        autocmd!
        autocmd InsertEnter * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false)
        autocmd InsertLeavePre * call llama#fim_cancel()

        autocmd CursorMoved * call s:on_move()
        autocmd CursorMovedI * call s:on_move()
        autocmd CompleteChanged * call llama#fim_cancel()

        if g:llama_config.auto_fim
            autocmd CursorMovedI * call llama#fim(v:true)
        endif

        " gather chunks upon yanking
        autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif

        " gather chunks upon entering/leaving a buffer
        autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)})
        autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)

        " gather chunks upon saving the file
        autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
    augroup END

    silent! call llama#fim_cancel()

    " init background update of the ring buffer
    if g:llama_config.ring_n_chunks > 0
        call s:ring_update()
    endif
endfunction

" compute how similar two chunks of text are
" 0 - no similarity, 1 - high similarity
" TODO: figure out something better
function! s:chunk_sim(c0, c1)
    let l:lines0 = len(a:c0)
    let l:lines1 = len(a:c1)

    let l:common = 0

    for l:line0 in a:c0
        for l:line1 in a:c1
            if l:line0 == l:line1
                let l:common += 1
                break
            endif
        endfor
    endfor

    return 2.0 * l:common / (l:lines0 + l:lines1)
endfunction
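
" example: two 4-line chunks that share 2 lines score 2.0 * 2 / (4 + 4) = 0.5;
" identical chunks score 1.0, and the eviction logic below treats > 0.9 as duplicates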
  183. " pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing
  184. "
  185. " no_mod - do not pick chunks from buffers with pending changes
  186. " do_evict - evict chunks that are very similar to the new one
  187. "
  188. function! s:pick_chunk(text, no_mod, do_evict)
  189. " do not pick chunks from buffers with pending changes or buffers that are not files
  190. if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
  191. return
  192. endif
  193. " if the extra context option is disabled - do nothing
  194. if g:llama_config.ring_n_chunks <= 0
  195. return
  196. endif
  197. " don't pick very small chunks
  198. if len(a:text) < 3
  199. return
  200. endif
  201. if len(a:text) + 1 < g:llama_config.ring_chunk_size
  202. let l:chunk = a:text
  203. else
  204. let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2]))
  205. let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)])
  206. let l:chunk = a:text[l:l0:l:l1]
  207. endif
  208. let l:chunk_str = join(l:chunk, "\n") . "\n"
  209. " check if this chunk is already added
  210. let l:exist = v:false
  211. for i in range(len(s:ring_chunks))
  212. if s:ring_chunks[i].data == l:chunk
  213. let l:exist = v:true
  214. break
  215. endif
  216. endfor
  217. for i in range(len(s:ring_queued))
  218. if s:ring_queued[i].data == l:chunk
  219. let l:exist = v:true
  220. break
  221. endif
  222. endfor
  223. if l:exist
  224. return
  225. endif
  226. " evict queued chunks that are very similar to the new one
  227. for i in range(len(s:ring_queued) - 1, 0, -1)
  228. if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9
  229. if a:do_evict
  230. call remove(s:ring_queued, i)
  231. let s:ring_n_evict += 1
  232. else
  233. return
  234. endif
  235. endif
  236. endfor
  237. " also from s:ring_chunks
  238. for i in range(len(s:ring_chunks) - 1, 0, -1)
  239. if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9
  240. if a:do_evict
  241. call remove(s:ring_chunks, i)
  242. let s:ring_n_evict += 1
  243. else
  244. return
  245. endif
  246. endif
  247. endfor
  248. " TODO: become parameter ?
  249. if len(s:ring_queued) == 16
  250. call remove(s:ring_queued, 0)
  251. endif
  252. call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})
  253. "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
  254. endfunction
  255. " picks a queued chunk, sends it for processing and adds it to s:ring_chunks
  256. " called every g:llama_config.ring_update_ms
  257. function! s:ring_update()
  258. call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()})
  259. " update only if in normal mode or if the cursor hasn't moved for a while
  260. if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0
  261. return
  262. endif
  263. if len(s:ring_queued) == 0
  264. return
  265. endif
  266. " move the first queued chunk to the ring buffer
  267. if len(s:ring_chunks) == g:llama_config.ring_n_chunks
  268. call remove(s:ring_chunks, 0)
  269. endif
  270. call add(s:ring_chunks, remove(s:ring_queued, 0))
  271. "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
  272. " send asynchronous job with the new extra context so that it is ready for the next FIM
  273. let l:extra_context = []
  274. for l:chunk in s:ring_chunks
  275. call add(l:extra_context, {
  276. \ 'text': l:chunk.str,
  277. \ 'time': l:chunk.time,
  278. \ 'filename': l:chunk.filename
  279. \ })
  280. endfor
  281. " no samplers needed here
  282. let l:request = json_encode({
  283. \ 'input_prefix': "",
  284. \ 'input_suffix': "",
  285. \ 'input_extra': l:extra_context,
  286. \ 'prompt': "",
  287. \ 'n_predict': 1,
  288. \ 'temperature': 0.0,
  289. \ 'stream': v:false,
  290. \ 'samplers': ["temperature"],
  291. \ 'cache_prompt': v:true,
  292. \ 't_max_prompt_ms': 1,
  293. \ 't_max_predict_ms': 1
  294. \ })
  295. let l:curl_command = [
  296. \ "curl",
  297. \ "--silent",
  298. \ "--no-buffer",
  299. \ "--request", "POST",
  300. \ "--url", g:llama_config.endpoint,
  301. \ "--header", "Content-Type: application/json",
  302. \ "--data", l:request
  303. \ ]
  304. " no callbacks because we don't need to process the response
  305. if s:ghost_text_nvim
  306. call jobstart(l:curl_command, {})
  307. elseif s:ghost_text_vim
  308. call job_start(l:curl_command, {})
  309. endif
  310. endfunction
  311. " necessary for 'inoremap <expr>'
  312. function! llama#fim_inline(is_auto) abort
  313. call llama#fim(a:is_auto)
  314. return ''
  315. endfunction
  316. " the main FIM call
  317. " takes local context around the cursor and sends it together with the extra context to the server for completion
  318. function! llama#fim(is_auto) abort
  319. " we already have a suggestion for the current cursor position
  320. if s:hint_shown && !a:is_auto
  321. call llama#fim_cancel()
  322. return
  323. endif
  324. call llama#fim_cancel()
  325. " avoid sending repeated requests too fast
    if reltimefloat(reltime(s:t_fim_start)) < 0.6
        if s:timer_fim != -1
            call timer_stop(s:timer_fim)
            let s:timer_fim = -1
        endif

        let s:t_fim_start = reltime()
        let s:timer_fim = timer_start(600, {-> llama#fim(v:true)})
        return
    endif

    let s:t_fim_start = reltime()

    let s:content = []
    let s:can_accept = v:false

    let s:pos_x = col('.') - 1
    let s:pos_y = line('.')
    let l:max_y = line('$')

    let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1)
    let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix]))

    let s:line_cur = getline('.')

    let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x)
    let s:line_cur_suffix = strpart(s:line_cur, s:pos_x)

    if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix
        return
    endif

    let l:prefix = ""
        \ . join(l:lines_prefix, "\n")
        \ . "\n"

    let l:prompt = ""
        \ . s:line_cur_prefix

    let l:suffix = ""
        \ . s:line_cur_suffix
        \ . "\n"
        \ . join(l:lines_suffix, "\n")
        \ . "\n"

    " prepare the extra context data
    let l:extra_context = []
    for l:chunk in s:ring_chunks
        call add(l:extra_context, {
            \ 'text':     l:chunk.str,
            \ 'time':     l:chunk.time,
            \ 'filename': l:chunk.filename
            \ })
    endfor

    " the indentation of the current line
    let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))

    let l:request = json_encode({
        \ 'input_prefix':     l:prefix,
        \ 'input_suffix':     l:suffix,
        \ 'input_extra':      l:extra_context,
        \ 'prompt':           l:prompt,
        \ 'n_predict':        g:llama_config.n_predict,
        \ 'n_indent':         l:indent,
        \ 'top_k':            40,
        \ 'top_p':            0.99,
        \ 'stream':           v:false,
        \ 'samplers':         ["top_k", "top_p", "infill"],
        \ 'cache_prompt':     v:true,
        \ 't_max_prompt_ms':  g:llama_config.t_max_prompt_ms,
        \ 't_max_predict_ms': g:llama_config.t_max_predict_ms
        \ })

    let l:curl_command = [
        \ "curl",
        \ "--silent",
        \ "--no-buffer",
        \ "--request", "POST",
        \ "--url", g:llama_config.endpoint,
        \ "--header", "Content-Type: application/json",
        \ "--data", l:request
        \ ]

    if s:current_job != v:null
        if s:ghost_text_nvim
            call jobstop(s:current_job)
        elseif s:ghost_text_vim
            call job_stop(s:current_job)
        endif
    endif

    " send the request asynchronously
    if s:ghost_text_nvim
        let s:current_job = jobstart(l:curl_command, {
            \ 'on_stdout': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
            \ 'on_exit':   function('s:fim_on_exit'),
            \ 'stdout_buffered': v:true
            \ })
    elseif s:ghost_text_vim
        let s:current_job = job_start(l:curl_command, {
            \ 'out_cb':  function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
            \ 'exit_cb': function('s:fim_on_exit')
            \ })
    endif

    " TODO: per-file location
    let l:delta_y = abs(s:pos_y - s:pos_y_pick)

    " gather some extra context nearby and process it in the background
    " only gather chunks if the cursor has moved a lot
    " TODO: something more clever? reranking?
    if a:is_auto && l:delta_y > 32
        " expand the prefix even further
        call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)

        " pick a suffix chunk
        call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false)

        let s:pos_y_pick = s:pos_y
    endif
endfunction

" if first_line == v:true accept only the first line of the response
function! llama#fim_accept(first_line)
    " insert the suggestion at the cursor location
    if s:can_accept && len(s:content) > 0
        call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0])
        if len(s:content) > 1
            if !a:first_line
                call append(s:pos_y, s:content[1:-1])
            endif
        endif

        " move the cursor to the end of the accepted text
        if !a:first_line && len(s:content) > 1
            call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1)
        else
            call cursor(s:pos_y, s:pos_x + len(s:content[0]))
        endif
    endif

    call llama#fim_cancel()
endfunction

function! llama#fim_cancel()
    let s:hint_shown = v:false

    " clear the virtual text
    let l:bufnr = bufnr('%')

    if s:ghost_text_nvim
        let l:id_vt_fim = nvim_create_namespace('vt_fim')
        call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1)
    elseif s:ghost_text_vim
        call prop_remove({'type': s:hlgroup_hint, 'all': v:true})
        call prop_remove({'type': s:hlgroup_info, 'all': v:true})
    endif

    " remove the mappings
    silent! iunmap <buffer> <Tab>
    silent! iunmap <buffer> <S-Tab>
    silent! iunmap <buffer> <Esc>
endfunction

function! s:on_move()
    let s:t_last_move = reltime()

    call llama#fim_cancel()
endfunction

" callback that processes the FIM result from the server and displays the suggestion
function! s:fim_on_stdout(pos_x, pos_y, is_auto, job_id, data, event = v:null)
    if s:ghost_text_nvim
        let l:raw = join(a:data, "\n")
    elseif s:ghost_text_vim
        let l:raw = a:data
    endif

    if len(l:raw) == 0
        return
    endif

    if a:pos_x != col('.') - 1 || a:pos_y != line('.')
        return
    endif

    " show the suggestion only in insert mode
    if mode() !=# 'i'
        return
    endif

    let s:pos_x = a:pos_x
    let s:pos_y = a:pos_y

    let s:can_accept = v:true
    let l:has_info = v:false

    if s:can_accept && v:shell_error
        if !a:is_auto
            call add(s:content, "<| curl error: is the server on? |>")
        endif
        let s:can_accept = v:false
    endif

    let l:n_prompt = 0
    let l:t_prompt_ms = 1.0
    let l:s_prompt = 0

    let l:n_predict = 0
    let l:t_predict_ms = 1.0
    let l:s_predict = 0

    " get the generated suggestion
    if s:can_accept
        let l:response = json_decode(l:raw)

        for l:part in split(get(l:response, 'content', ''), "\n", 1)
            call add(s:content, l:part)
        endfor

        " remove trailing new lines
        while len(s:content) > 0 && s:content[-1] == ""
            call remove(s:content, -1)
        endwhile

        let l:generation_settings = get(l:response, 'generation_settings', {})
        let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)

        let l:n_cached  = get(l:response, 'tokens_cached', 0)
        let l:truncated = get(l:response, 'truncated', v:false)

        " if response.timings is available
        if len(get(l:response, 'timings', {})) > 0
            let l:has_info = v:true
            let l:timings = get(l:response, 'timings', {})

            let l:n_prompt     = get(l:timings, 'prompt_n', 0)
            let l:t_prompt_ms  = get(l:timings, 'prompt_ms', 1)
            let l:s_prompt     = get(l:timings, 'prompt_per_second', 0)

            let l:n_predict    = get(l:timings, 'predicted_n', 0)
            let l:t_predict_ms = get(l:timings, 'predicted_ms', 1)
            let l:s_predict    = get(l:timings, 'predicted_per_second', 0)
        endif
    endif

    if len(s:content) == 0
        call add(s:content, "")
        let s:can_accept = v:false
    endif

    if len(s:content) == 0
        return
    endif

    " NOTE: the following is logic for discarding predictions that repeat existing text
    "
    " the code is quite ugly and there is very likely a simpler and more canonical way to implement this
    "
    " still, I wonder if there is some better way that avoids having to do these special hacks?
    " on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would
    " start generating whatever we have given it via the extra context. but on the other hand, it's not very
    " helpful to re-generate the same code that is already there

    " truncate the suggestion if the first line is empty
    if len(s:content) == 1 && s:content[0] == ""
        let s:content = [""]
    endif

    " ... and the next lines are repeated
    if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1)
        let s:content = [""]
    endif

    " truncate the suggestion if it repeats the suffix
    if len(s:content) == 1 && s:content[0] == s:line_cur_suffix
        let s:content = [""]
    endif

    " find the first non-empty line (strip whitespace)
    let l:cmp_y = s:pos_y + 1
    while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$'
        let l:cmp_y += 1
    endwhile

    if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y)
        " truncate the suggestion if it repeats the next line
        if len(s:content) == 1
            let s:content = [""]
        endif

        " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1
        if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1]
            let s:content = [""]
        endif

        " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1)
        if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n")
            let s:content = [""]
        endif
    endif

    " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix
    "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
    "for i in range(1, len(s:content) - 1)
    "    if strlen(matchstr(s:content[i], '^\s*')) < l:indent
    "        let s:content = s:content[:i - 1]
    "        break
    "    endif
    "endfor

    let s:pos_dx = len(s:content[-1])

    let s:content[-1] .= s:line_cur_suffix

    call llama#fim_cancel()

    " display virtual text with the suggestion
    let l:bufnr = bufnr('%')

    if s:ghost_text_nvim
        let l:id_vt_fim = nvim_create_namespace('vt_fim')
    endif

    " construct the info message
    if g:llama_config.show_info > 0 && l:has_info
        let l:prefix = ' '

        if l:truncated
            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
                \ l:n_cached, l:n_ctx
                \ )
        else
            let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms",
                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
                \ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued),
                \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
                \ l:n_predict, l:t_predict_ms, l:s_predict,
                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
                \ )
        endif

        if g:llama_config.show_info == 1
            " display the info in the statusline
            let &statusline = l:info
            let l:info = ''
        endif
    endif

    " display the suggestion and append the info to the end of the first line
    if s:ghost_text_nvim
        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
            \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
            \ 'virt_text_win_col': virtcol('.') - 1
            \ })

        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
            \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
            \ 'virt_text_win_col': virtcol('.')
            \ })
    elseif s:ghost_text_vim
        let l:new_suffix = s:content[0]

        if !empty(l:new_suffix)
            call prop_add(s:pos_y, s:pos_x + 1, {
                \ 'type': s:hlgroup_hint,
                \ 'text': l:new_suffix
                \ })
        endif

        for line in s:content[1:]
            call prop_add(s:pos_y, 0, {
                \ 'type': s:hlgroup_hint,
                \ 'text': line,
                \ 'text_padding_left': s:get_indent(line),
                \ 'text_align': 'below'
                \ })
        endfor

        if !empty(l:info)
            call prop_add(s:pos_y, 0, {
                \ 'type': s:hlgroup_info,
                \ 'text': l:info,
                \ 'text_padding_left': col('$'),
                \ 'text_wrap': 'truncate'
                \ })
        endif
    endif

    " setup accept shortcuts
    inoremap <buffer> <Tab> <C-O>:call llama#fim_accept(v:false)<CR>
    inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>

    let s:hint_shown = v:true
endfunction

function! s:fim_on_exit(job_id, exit_code, event = v:null)
    if a:exit_code != 0
        echom "Job failed with exit code: " . a:exit_code
    endif

    let s:current_job = v:null
endfunction