# R script used for splitting the text into one file per BOOK
# ---- 1. Set paths ----
# Directory that receives the per-book files, and the source text to split.
out_dir <- "D:\\Carleton\\DGAH110\\Confucius-Analects-20260219T191835Z-3-001\\Confucius-Analects"
input_path <- "D:\\Carleton\\DGAH110\\Confucius-Analects-20260219T191835Z-3-001\\Confucius-Analects\\Confucius-Analects.txt"

# ---- 2. Read text (keep line breaks) ----
# One vector element per input line, with the input declared as UTF-8.
lines <- readLines(con = input_path, warn = FALSE, encoding = "UTF-8")

# ---- 3. Find BOOK headers (robust) ----
# A header line starts (after optional whitespace) with the upper-case word
# "BOOK" followed by a Roman numeral; trailing text such as the title in
# "BOOK VII. SHU R." is allowed. The numeral is captured as group 1.
m <- regexec(
  pattern = "^\\s*BOOK\\s+([IVXLC]+)\\b",
  text = lines,
  ignore.case = FALSE
)
# Per line: character(0) when there is no match, otherwise c(full, numeral).
hits <- regmatches(x = lines, m = m)
# Line numbers of all header lines.
book_idx <- which(lengths(hits) != 0L)
# ---- 4. Helper: roman numeral to integer ----
# Convert a Roman numeral string to an integer.
#
# r: a single character string such as "VII". Case and surrounding
#    whitespace are ignored.
#
# Returns a value of type integer, or NA_integer_ when `r` is not exactly one
# non-missing, non-empty string made up only of the letters I, V, X, L, C, D
# and M. The numeral is not checked for canonical form: each symbol is added,
# except that a symbol standing directly before a larger one is subtracted.
roman_to_int <- function(r) {
  # Anything other than one non-missing value cannot be a numeral; checking
  # this first also keeps the scalar `||` below from seeing bad lengths.
  if (length(r) != 1L || is.na(r)) {
    return(NA_integer_)
  }
  r <- toupper(trimws(r))
  if (!nzchar(r) || grepl("[^IVXLCDM]", r)) {
    return(NA_integer_)
  }
  # Integer symbol values so that the result is an integer, matching the
  # NA_integer_ returned on failure.
  vals <- c(I = 1L, V = 5L, X = 10L, L = 50L, C = 100L, D = 500L, M = 1000L)
  nums <- unname(vals[strsplit(r, "")[[1]]])
  # Right-hand neighbour of each symbol; the last symbol is paired with 0 so
  # that it is always added.
  next_nums <- c(nums[-1L], 0L)
  signs <- ifelse(nums < next_nums, -1L, 1L)
  sum(signs * nums)
}
# ---- 5. Create output dir ----
# Create the destination directory (and any missing parents) when absent.
if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = TRUE)
}

# ---- 6. Split and write ----
# A book runs from its header line to the line before the next header; the
# last book runs to the end of the text. Lines before the first header are
# not written anywhere.
first_line <- book_idx
last_line <- c(book_idx[-1], length(lines) + 1) - 1

for (k in seq_along(book_idx)) {
  # Element 2 of a match is the captured Roman numeral of the header.
  numeral <- hits[[book_idx[k]]][2]
  book_num <- roman_to_int(numeral)
  # Fall back to the header's position in the file if conversion fails.
  if (is.na(book_num)) {
    book_num <- k
  }
  # Zero-padded file name, e.g. book_07.txt.
  file_path <- file.path(out_dir, sprintf("book_%02d.txt", book_num))
  writeLines(
    lines[first_line[k]:last_line[k]],
    con = file_path,
    useBytes = TRUE
  )
}

message("Finished:", normalizePath(out_dir))