1
0
Эх сурвалжийг харах

Enhance text file detection logic for file attachments (#16199)

* feat: Enhances text file detection logic

* chore: Build static `webui` output

* chore: update webui build output
Aleksander Grygier 3 сар өмнө
parent
commit
807e8c6d31

BIN
tools/server/public/index.html.gz


+ 14 - 0
tools/server/webui/src/lib/constants/binary-detection.ts

@@ -0,0 +1,14 @@
+export interface BinaryDetectionOptions {
+	/** Number of leading characters (UTF-16 code units) of the content to sample */
+	prefixLength: number;
+	/** Highest ratio of suspicious characters to sample length still treated as text (0.0 to 1.0) */
+	suspiciousCharThresholdRatio: number;
+	/** Highest number of null characters in the sample still treated as text */
+	maxAbsoluteNullBytes: number;
+}
+
+export const DEFAULT_BINARY_DETECTION_OPTIONS: BinaryDetectionOptions = {
+	prefixLength: 1024 * 10, // Sample the first 10,240 characters of the string (characters, not bytes)
+	suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars
+	maxAbsoluteNullBytes: 2 // Allow up to 2 null characters in the sample
+};

+ 8 - 0
tools/server/webui/src/lib/constants/supported-file-types.ts

@@ -176,5 +176,13 @@ export const TEXT_FILE_TYPES = {
 	[FileTypeText.SVELTE]: {
 		extensions: [FileExtensionText.SVELTE],
 		mimeTypes: [MimeTypeText.SVELTE]
+	},
+	[FileTypeText.LATEX]: {
+		extensions: [FileExtensionText.TEX],
+		mimeTypes: [MimeTypeText.LATEX]
+	},
+	[FileTypeText.BIBTEX]: {
+		extensions: [FileExtensionText.BIB],
+		mimeTypes: [MimeTypeText.BIBTEX]
 	}
 } as const;

+ 9 - 3
tools/server/webui/src/lib/enums/files.ts

@@ -59,7 +59,9 @@ export enum FileTypeText {
 	SWIFT = 'swift',
 	DART = 'dart',
 	VUE = 'vue',
-	SVELTE = 'svelte'
+	SVELTE = 'svelte',
+	LATEX = 'latex',
+	BIBTEX = 'bibtex'
 }
 
 // File extension enums
@@ -115,7 +117,9 @@ export enum FileExtensionText {
 	SWIFT = '.swift',
 	DART = '.dart',
 	VUE = '.vue',
-	SVELTE = '.svelte'
+	SVELTE = '.svelte',
+	TEX = '.tex',
+	BIB = '.bib'
 }
 
 // MIME type enums
@@ -174,5 +178,7 @@ export enum MimeTypeText {
 	SWIFT = 'text/x-swift',
 	DART = 'text/x-dart',
 	VUE = 'text/x-vue',
-	SVELTE = 'text/x-svelte'
+	SVELTE = 'text/x-svelte',
+	LATEX = 'text/x-tex',
+	BIBTEX = 'text/x-bibtex'
 }

+ 25 - 11
tools/server/webui/src/lib/utils/text-files.ts

@@ -3,6 +3,10 @@
  * Handles text file detection, reading, and validation
  */
 
+import {
+	DEFAULT_BINARY_DETECTION_OPTIONS,
+	type BinaryDetectionOptions
+} from '$lib/constants/binary-detection';
 import { FileExtensionText } from '$lib/enums/files';
 
 /**
@@ -43,41 +47,51 @@ export async function readFileAsText(file: File): Promise<string> {
  * Heuristic check to determine if content is likely from a text file
  * Detects binary files by counting suspicious characters and null bytes
  * @param content - The file content to analyze
+ * @param options - Optional configuration for detection parameters
  * @returns True if the content appears to be text-based
  */
-export function isLikelyTextFile(content: string): boolean {
+export function isLikelyTextFile(
+	content: string,
+	options: Partial<BinaryDetectionOptions> = {}
+): boolean {
 	if (!content) return true;
 
-	const sample = content.substring(0, 1000);
+	const config = { ...DEFAULT_BINARY_DETECTION_OPTIONS, ...options };
+	const sample = content.substring(0, config.prefixLength);
 
-	let suspiciousCount = 0;
 	let nullCount = 0;
+	let suspiciousControlCount = 0;
 
 	for (let i = 0; i < sample.length; i++) {
 		const charCode = sample.charCodeAt(i);
 
-		// Count null bytes
+		// Count null characters separately - they have their own absolute limit below
 		if (charCode === 0) {
 			nullCount++;
-			suspiciousCount++;

 			continue;
 		}
 
-		// Count suspicious control characters (excluding common ones like tab, newline, carriage return)
+		// Inspect control characters (codes below 32)
+		// Allow common whitespace characters: tab (9), newline (10), carriage return (13)
 		if (charCode < 32 && charCode !== 9 && charCode !== 10 && charCode !== 13) {
-			suspiciousCount++;
+			// Only codes 1-7 and 14-26 are counted; codes 8, 11, 12 and 27-31 are tolerated
+			if (charCode < 8 || (charCode > 13 && charCode < 27)) {
+				suspiciousControlCount++;
+			}
 		}
 
 		// Count replacement characters (indicates encoding issues)
 		if (charCode === 0xfffd) {
-			suspiciousCount++;
+			suspiciousControlCount++;
 		}
 	}
 
-	// Reject if too many null bytes or suspicious characters
-	if (nullCount > 2) return false;
-	if (suspiciousCount / sample.length > 0.1) return false;
+	// Reject if the sample holds more null characters than allowed
+	if (nullCount > config.maxAbsoluteNullBytes) return false;
+
+	// Reject if the share of suspicious characters in the sample exceeds the threshold
+	if (suspiciousControlCount / sample.length > config.suspiciousCharThresholdRatio) return false;
 
 	return true;
 }