|
|
@@ -3,6 +3,10 @@
|
|
|
* Handles text file detection, reading, and validation
|
|
|
*/
|
|
|
|
|
|
+import {
|
|
|
+ DEFAULT_BINARY_DETECTION_OPTIONS,
|
|
|
+ type BinaryDetectionOptions
|
|
|
+} from '$lib/constants/binary-detection';
|
|
|
import { FileExtensionText } from '$lib/enums/files';
|
|
|
|
|
|
/**
|
|
|
@@ -43,41 +47,51 @@ export async function readFileAsText(file: File): Promise<string> {
|
|
|
* Heuristic check to determine if content is likely from a text file
|
|
|
* Detects binary files by counting suspicious characters and null bytes
|
|
|
* @param content - The file content to analyze
|
|
|
+ * @param options - Optional overrides for the detection parameters; omitted fields fall back to DEFAULT_BINARY_DETECTION_OPTIONS
|
|
|
* @returns True if the content appears to be text-based
|
|
|
*/
|
|
|
export function isLikelyTextFile(
	content: string,
	options: Partial<BinaryDetectionOptions> = {}
): boolean {
	// Empty content carries no binary markers, so it is treated as text.
	if (!content) return true;

	// Resolve every setting with `??` instead of object spread: a spread would let an
	// explicitly `undefined` override replace its default, which makes `substring` scan
	// the whole content and makes the threshold comparisons below always false.
	const prefixLength = options.prefixLength ?? DEFAULT_BINARY_DETECTION_OPTIONS.prefixLength;
	const maxAbsoluteNullBytes =
		options.maxAbsoluteNullBytes ?? DEFAULT_BINARY_DETECTION_OPTIONS.maxAbsoluteNullBytes;
	const suspiciousCharThresholdRatio =
		options.suspiciousCharThresholdRatio ??
		DEFAULT_BINARY_DETECTION_OPTIONS.suspiciousCharThresholdRatio;

	// Only the leading `prefixLength` UTF-16 code units are inspected.
	const sample = content.substring(0, prefixLength);

	let nullCount = 0;
	let suspiciousControlCount = 0;

	for (let i = 0; i < sample.length; i++) {
		const charCode = sample.charCodeAt(i);

		// Null bytes are strong indicators of binary files. They are tracked on their
		// own and do not contribute to the suspicious-character ratio.
		if (charCode === 0) {
			nullCount++;
			continue;
		}

		if (charCode < 8 || (charCode > 13 && charCode < 27)) {
			// Control characters 1-7 and 14-26. Tab (9), newline (10) and carriage
			// return (13) are ordinary whitespace; 8, 11, 12 and 27-31 are tolerated.
			suspiciousControlCount++;
		} else if (charCode === 0xfffd) {
			// U+FFFD replacement character (indicates encoding issues).
			suspiciousControlCount++;
		}
	}

	// Reject if too many null bytes.
	if (nullCount > maxAbsoluteNullBytes) return false;

	// Reject if too many suspicious characters. An empty sample gives 0 / 0 = NaN,
	// which never compares greater than the threshold, so it is accepted.
	if (suspiciousControlCount / sample.length > suspiciousCharThresholdRatio) return false;

	return true;
}
|