import { estimateTokens, TokenizerModel } from "@cosine/common"
import { AIModel } from "../models"

/**
 * Splits `str` into consecutive chunks whose estimated token count fits within
 * `model.maxTokens`.
 *
 * Each chunk starts as a window of at most `chunkSize` characters. When
 * `estimateTokens` reports more tokens than `model.maxTokens` for the window,
 * the window is shrunk from its end, proportionally to the overflow, until it
 * fits. Chunks are contiguous and never empty, so joining them reproduces `str`.
 *
 * A chunk is never shrunk below one character: if a single character is still
 * over the limit it is emitted as-is, which guarantees the split terminates.
 *
 * @param model - Model whose `id` is passed to `estimateTokens` as the tokenizer
 *   model and whose `maxTokens` caps each chunk.
 * @param str - Text to split.
 * @param chunkSize - Maximum chunk length in characters (UTF-16 code units);
 *   fractional values are rounded down.
 * @returns The chunks in order; an empty array when `str` is empty.
 * @throws RangeError if `str` is non-empty and `chunkSize` is not a number >= 1.
 */
export function splitIntoChunks(model: AIModel, str: string, chunkSize: number): string[] {
  // How aggressively we should shrink the chunk
  const AGGRESSION_FACTOR = 1.5

  const chunks: string[] = []
  if (str.length === 0) {
    return chunks
  }

  const windowSize = Math.floor(chunkSize)
  // Written as a negated comparison so that NaN is rejected as well.
  if (!(windowSize >= 1)) {
    throw new RangeError(`chunkSize must be a number >= 1, received ${chunkSize}`)
  }

  const tokenizer = model.id as TokenizerModel
  let startIdx = 0

  while (startIdx < str.length) {
    // Clamp the window to the string so chunk.length always equals endIdx - startIdx;
    // this also makes the final (shorter) remainder go through the same shrink logic.
    let endIdx = Math.min(startIdx + windowSize, str.length)
    let chunk = str.slice(startIdx, endIdx)
    let chunkTokens = estimateTokens(chunk, tokenizer)

    // Shrink the chunk until it fits (or cannot be shrunk any further)
    while (chunkTokens > model.maxTokens && chunk.length > 1) {
      const overflowTokens = chunkTokens - model.maxTokens
      const reductionRatio = overflowTokens / chunkTokens

      // Reduce endIdx based on the ratio of overflow tokens
      const proposed = Math.floor(chunk.length * reductionRatio * AGGRESSION_FACTOR)

      // Always drop at least one character (otherwise a zero reduction would loop
      // forever) and always keep at least one (otherwise endIdx could fall to or
      // below startIdx, yielding an empty chunk and no forward progress).
      const reduction = Math.min(Math.max(proposed, 1), chunk.length - 1)
      endIdx -= reduction

      chunk = str.slice(startIdx, endIdx)
      chunkTokens = estimateTokens(chunk, tokenizer)
    }

    chunks.push(chunk)
    startIdx = endIdx
  }

  return chunks
}
