1
0

datautils.mjs 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. //@ts-check
  2. // Helpers to work with different data types
  3. // by Humans for All
  4. //
  5. /**
  6. * Given the limited context size of local LLMs and , many a times when context gets filled
  7. * between the prompt and the response, it can lead to repeating text garbage generation.
  8. * And many a times setting penalty wrt repeatation leads to over-intelligent garbage
  9. * repeatation with slight variations. These garbage inturn can lead to overloading of the
  10. * available model context, leading to less valuable response for subsequent prompts/queries,
  11. * if chat history is sent to ai model.
  12. *
  13. * So two simple minded garbage trimming logics are experimented below.
  14. * * one based on progressively-larger-substring-based-repeat-matching-with-partial-skip and
  15. * * another based on char-histogram-driven garbage trimming.
  16. * * in future characteristic of histogram over varying lengths could be used to allow for
  17. * a more aggressive and adaptive trimming logic.
  18. */
  19. /**
  20. * Simple minded logic to help remove repeating garbage at end of the string.
  21. * The repeatation needs to be perfectly matching.
  22. *
  23. * The logic progressively goes on probing for longer and longer substring based
  24. * repeatation, till there is no longer repeatation. Inturn picks the one with
  25. * the longest chain.
  26. *
  27. * @param {string} sIn
  28. * @param {number} maxSubL
  29. * @param {number} maxMatchLenThreshold
  30. */
  31. export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold=40) {
  32. let rCnt = [0];
  33. let maxMatchLen = maxSubL;
  34. let iMML = -1;
  35. for(let subL=1; subL < maxSubL; subL++) {
  36. rCnt.push(0);
  37. let i;
  38. let refS = sIn.substring(sIn.length-subL, sIn.length);
  39. for(i=sIn.length; i > 0; i -= subL) {
  40. let curS = sIn.substring(i-subL, i);
  41. if (refS != curS) {
  42. let curMatchLen = rCnt[subL]*subL;
  43. if (maxMatchLen < curMatchLen) {
  44. maxMatchLen = curMatchLen;
  45. iMML = subL;
  46. }
  47. break;
  48. }
  49. rCnt[subL] += 1;
  50. }
  51. }
  52. console.debug("DBUG:DU:TrimRepeatGarbage:", rCnt);
  53. if ((iMML == -1) || (maxMatchLen < maxMatchLenThreshold)) {
  54. return {trimmed: false, data: sIn};
  55. }
  56. console.debug("DBUG:TrimRepeatGarbage:TrimmedCharLen:", maxMatchLen);
  57. let iEnd = sIn.length - maxMatchLen;
  58. return { trimmed: true, data: sIn.substring(0, iEnd) };
  59. }
  60. /**
  61. * Simple minded logic to help remove repeating garbage at end of the string, till it cant.
  62. * If its not able to trim, then it will try to skip a char at end and then trim, a few times.
  63. * This ensures that even if there are multiple runs of garbage with different patterns, the
  64. * logic still tries to munch through them.
  65. *
  66. * @param {string} sIn
  67. * @param {number} maxSubL
  68. * @param {number | undefined} [maxMatchLenThreshold]
  69. */
  70. export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThreshold, skipMax=16) {
  71. let sCur = sIn;
  72. let sSaved = "";
  73. let iTry = 0;
  74. while(true) {
  75. let got = trim_repeat_garbage_at_end(sCur, maxSubL, maxMatchLenThreshold);
  76. if (got.trimmed != true) {
  77. if (iTry == 0) {
  78. sSaved = got.data;
  79. }
  80. iTry += 1;
  81. if (iTry >= skipMax) {
  82. return sSaved;
  83. }
  84. got.data = got.data.substring(0,got.data.length-1);
  85. } else {
  86. iTry = 0;
  87. }
  88. sCur = got.data;
  89. }
  90. }
  91. /**
  92. * A simple minded try trim garbage at end using histogram driven characteristics.
  93. * There can be variation in the repeatations, as long as no new char props up.
  94. *
  95. * This tracks the chars and their frequency in a specified length of substring at the end
  96. * and inturn checks if moving further into the generated text from the end remains within
  97. * the same char subset or goes beyond it and based on that either trims the string at the
  98. * end or not. This allows to filter garbage at the end, including even if there are certain
  99. * kind of small variations in the repeated text wrt position of seen chars.
  100. *
  101. * Allow the garbage to contain upto maxUniq chars, but at the same time ensure that
  102. * a given type of char ie numerals or alphabets or other types dont cross the specified
  103. * maxType limit. This allows intermixed text garbage to be identified and trimmed.
  104. *
  105. * ALERT: This is not perfect and only provides a rough garbage identification logic.
  106. * Also it currently only differentiates between character classes wrt english.
  107. *
  108. * @param {string} sIn
  109. * @param {number} maxType
  110. * @param {number} maxUniq
  111. * @param {number} maxMatchLenThreshold
  112. */
  113. export function trim_hist_garbage_at_end(sIn, maxType, maxUniq, maxMatchLenThreshold) {
  114. if (sIn.length < maxMatchLenThreshold) {
  115. return { trimmed: false, data: sIn };
  116. }
  117. let iAlp = 0;
  118. let iNum = 0;
  119. let iOth = 0;
  120. // Learn
  121. let hist = {};
  122. let iUniq = 0;
  123. for(let i=0; i<maxMatchLenThreshold; i++) {
  124. let c = sIn[sIn.length-1-i];
  125. if (c in hist) {
  126. hist[c] += 1;
  127. } else {
  128. if(c.match(/[0-9]/) != null) {
  129. iNum += 1;
  130. } else if(c.match(/[A-Za-z]/) != null) {
  131. iAlp += 1;
  132. } else {
  133. iOth += 1;
  134. }
  135. iUniq += 1;
  136. if (iUniq >= maxUniq) {
  137. break;
  138. }
  139. hist[c] = 1;
  140. }
  141. }
  142. console.debug("DBUG:TrimHistGarbage:", hist);
  143. if ((iAlp > maxType) || (iNum > maxType) || (iOth > maxType)) {
  144. return { trimmed: false, data: sIn };
  145. }
  146. // Catch and Trim
  147. for(let i=0; i < sIn.length; i++) {
  148. let c = sIn[sIn.length-1-i];
  149. if (!(c in hist)) {
  150. if (i < maxMatchLenThreshold) {
  151. return { trimmed: false, data: sIn };
  152. }
  153. console.debug("DBUG:TrimHistGarbage:TrimmedCharLen:", i);
  154. return { trimmed: true, data: sIn.substring(0, sIn.length-i+1) };
  155. }
  156. }
  157. console.debug("DBUG:TrimHistGarbage:Trimmed fully");
  158. return { trimmed: true, data: "" };
  159. }
  160. /**
  161. * Keep trimming repeatedly using hist_garbage logic, till you no longer can.
  162. * This ensures that even if there are multiple runs of garbage with different patterns,
  163. * the logic still tries to munch through them.
  164. *
  165. * @param {any} sIn
  166. * @param {number} maxType
  167. * @param {number} maxUniq
  168. * @param {number} maxMatchLenThreshold
  169. */
  170. export function trim_hist_garbage_at_end_loop(sIn, maxType, maxUniq, maxMatchLenThreshold) {
  171. let sCur = sIn;
  172. while (true) {
  173. let got = trim_hist_garbage_at_end(sCur, maxType, maxUniq, maxMatchLenThreshold);
  174. if (!got.trimmed) {
  175. return got.data;
  176. }
  177. sCur = got.data;
  178. }
  179. }
  180. /**
  181. * Try trim garbage at the end by using both the hist-driven-garbage-trimming as well as
  182. * skip-a-bit-if-reqd-then-repeat-pattern-based-garbage-trimming, with blind retrying.
  183. * @param {string} sIn
  184. */
  185. export function trim_garbage_at_end(sIn) {
  186. let sCur = sIn;
  187. for(let i=0; i<2; i++) {
  188. sCur = trim_hist_garbage_at_end_loop(sCur, 8, 24, 72);
  189. sCur = trim_repeat_garbage_at_end_loop(sCur, 32, 72, 12);
  190. }
  191. return sCur;
  192. }
  193. /**
  194. * NewLines array helper.
  195. * Allow for maintaining a list of lines.
  196. * Allow for a line to be builtup/appended part by part.
  197. */
  198. export class NewLines {
  199. constructor() {
  200. /** @type {string[]} */
  201. this.lines = [];
  202. }
  203. /**
  204. * Extracts lines from the passed string and inturn either
  205. * append to a previous partial line or add a new line.
  206. * @param {string} sLines
  207. */
  208. add_append(sLines) {
  209. let aLines = sLines.split("\n");
  210. let lCnt = 0;
  211. for(let line of aLines) {
  212. lCnt += 1;
  213. // Add back newline removed if any during split
  214. if (lCnt < aLines.length) {
  215. line += "\n";
  216. } else {
  217. if (sLines.endsWith("\n")) {
  218. line += "\n";
  219. }
  220. }
  221. // Append if required
  222. if (lCnt == 1) {
  223. let lastLine = this.lines[this.lines.length-1];
  224. if (lastLine != undefined) {
  225. if (!lastLine.endsWith("\n")) {
  226. this.lines[this.lines.length-1] += line;
  227. continue;
  228. }
  229. }
  230. }
  231. // Add new line
  232. this.lines.push(line);
  233. }
  234. }
  235. /**
  236. * Shift the oldest/earliest/0th line in the array. [Old-New|Earliest-Latest]
  237. * Optionally control whether only full lines (ie those with newline at end) will be returned
  238. * or will a partial line without a newline at end (can only be the last line) be returned.
  239. * @param {boolean} bFullWithNewLineOnly
  240. */
  241. shift(bFullWithNewLineOnly=true) {
  242. let line = this.lines[0];
  243. if (line == undefined) {
  244. return undefined;
  245. }
  246. if ((line[line.length-1] != "\n") && bFullWithNewLineOnly){
  247. return undefined;
  248. }
  249. return this.lines.shift();
  250. }
  251. }