# R script that splits a text file into one output file per BOOK header.

# ---- 1. Locations of the source text and the output folder ----
input_path <- "D:\\Carleton\\DGAH110\\Confucius-Analects-20260219T191835Z-3-001\\Confucius-Analects\\Confucius-Analects.txt"
out_dir <- "D:\\Carleton\\DGAH110\\Confucius-Analects-20260219T191835Z-3-001\\Confucius-Analects"

# ---- 2. Load the text as a character vector, one element per line ----
lines <- readLines(con = input_path, warn = FALSE, encoding = "UTF-8")

# ---- 3. Locate the BOOK header lines ----
# A header is a line that starts (after optional whitespace) with upper-case
# "BOOK" followed by a Roman numeral. Anything may follow the numeral, e.g.
# "BOOK VII. SHU R.". The capture group holds the numeral itself.
m <- regexec(
  pattern = "^\\s*BOOK\\s+([IVXLC]+)\\b",
  text = lines,
  ignore.case = FALSE
)
hits <- regmatches(x = lines, m = m)

# Lines that did not match yield a zero-length entry in `hits`, so every
# non-empty entry marks a header line.
book_idx <- which(lengths(hits) != 0L)

# ---- 4. Helper: roman numeral to integer ----
# Convert a single Roman numeral string to an integer.
#
# Arguments:
#   r  A length-one value holding a Roman numeral. Surrounding whitespace is
#      ignored and lower-case letters are accepted.
#
# Returns:
#   An integer scalar. NA_integer_ is returned when `r` is not a single
#   non-missing value, is empty after trimming, or contains any character
#   other than I, V, X, L, C, D, M.
#
# Note: only the alphabet is validated, not the numeral's well-formedness, so
# a string such as "IIII" is still summed (to 4) rather than rejected.
roman_to_int <- function(r) {
  # Guard against NULL / zero-length / NA / vector input up front so the
  # scalar `if` below never sees a missing or multi-element condition.
  if (length(r) != 1L || is.na(r)) return(NA_integer_)

  r <- toupper(trimws(r))
  if (!nzchar(r) || grepl("[^IVXLCDM]", r)) return(NA_integer_)

  # Integer values keep every intermediate result (and the return value) an
  # integer, matching the NA_integer_ returned on failure.
  vals <- c(I = 1L, V = 5L, X = 10L, L = 50L, C = 100L, D = 500L, M = 1000L)
  nums <- unname(vals[strsplit(r, "")[[1]]])

  # if unidentified, return NA
  if (anyNA(nums)) return(NA_integer_)

  # Subtractive notation: a symbol that is smaller than the one right after it
  # is subtracted (the I in IV); every other symbol is added. The last symbol
  # is compared against 0 and is therefore always added.
  following <- c(nums[-1], 0L)
  signs <- rep(1L, length(nums))
  signs[nums < following] <- -1L
  sum(signs * nums)
}

# ---- 5. Create output dir ----
# dir.create() signals failure through its return value, so check it here
# instead of discovering the problem later as a failed write.
if (!dir.exists(out_dir) && !dir.create(out_dir, recursive = TRUE)) {
  stop("Could not create output directory: ", out_dir, call. = FALSE)
}

# ---- 6. Split and write ----
# Each book runs from its header line up to the line before the next header;
# the last book runs to the end of the text. Lines that precede the first
# header are not written to any file.
if (length(book_idx) == 0L) {
  warning("No BOOK headers were found; no files were written.", call. = FALSE)
}

boundaries <- c(book_idx, length(lines) + 1)

# Work out every book number up front. Element 2 of a match is the captured
# Roman numeral. as.numeric() makes the result type independent of whether
# roman_to_int() hands back an integer or a double.
book_nums <- vapply(
  book_idx,
  function(i) as.numeric(roman_to_int(hits[[i]][2])),
  numeric(1)
)

# Fall back to the header's position in the file when the numeral could not
# be converted.
unparsed <- is.na(book_nums)
book_nums[unparsed] <- which(unparsed)

file_names <- sprintf("book_%02d.txt", book_nums)

# Two sections that resolve to the same number share a file name, so the
# later one replaces the earlier one on disk. Report it instead of losing
# text silently.
clashes <- unique(file_names[duplicated(file_names)])
if (length(clashes) > 0L) {
  warning(
    "Several sections map to the same output file, later ones overwrite ",
    "earlier ones: ", paste(clashes, collapse = ", "),
    call. = FALSE
  )
}

for (k in seq_along(book_idx)) {
  start <- boundaries[k]
  end <- boundaries[k + 1] - 1

  file_path <- file.path(out_dir, file_names[k])

  # useBytes = TRUE writes the strings as they are, without re-encoding.
  writeLines(lines[start:end], con = file_path, useBytes = TRUE)
}

message("Finished:", normalizePath(out_dir))