library(shiny) |
library(udpipe) |
library(stringr) |
library(ggplot2) |
library(reshape2) |
# Load the UDPipe language model once, at top level (outside `server`).
# The file name is given without a directory, so it is resolved relative to
# the working directory the app is started from.
model <- udpipe_load_model(file = "french-gsd-ud-2.5-191206.udpipe")
# User interface.
#
# Sidebar: optional ZIP upload, a free-text area, and the "Analyze" button.
# Main panel: a results table or a corpus plot; which one is visible is driven
# client-side by the `isCorpus` output value.
ui <- fluidPage(
  titlePanel("French Readability and Cohesion Analyzer with UDPipe"),
  sidebarLayout(
    sidebarPanel(
      fileInput(
        inputId = "corpus_zip",
        label = "Upload ZIP with TXT files (optional)",
        accept = ".zip"
      ),
      textAreaInput(
        inputId = "text",
        label = "Or enter French text directly:",
        value = "",
        width = "100%",
        height = "200px",
        placeholder = "Type or paste French text here",
        resize = "both"
      ),
      actionButton(inputId = "analyze", label = "Analyze")
    ),
    mainPanel(
      h3("Readability and Cohesion Features"),
      # Shown when the analysis result is for a single text.
      conditionalPanel(
        condition = "output.isCorpus == false",
        tableOutput(outputId = "results")
      ),
      # Shown when the analysis result is for a corpus.
      conditionalPanel(
        condition = "output.isCorpus == true",
        plotOutput(outputId = "corpusPlots")
      )
    )
  )
)
# Server logic.
#
# On each click of "Analyze", computes readability and lexical-cohesion
# metrics either for the text typed into the text area (when no ZIP has been
# uploaded) or for every .txt file contained in the uploaded ZIP archive.
#
# @param input   Shiny input; reads `analyze`, `corpus_zip` and `text`.
# @param output  Shiny output; sets `results`, `corpusPlots` and `isCorpus`.
# @param session Shiny session object (not used directly).
server <- function(input, output, session) {
  # Part-of-speech tags that count as "words" for the word-based metrics.
  content_upos <- c("NOUN", "VERB", "ADJ", "ADV")

  # Character class of vowel letters used as a rough syllable proxy. Accented
  # French vowels are written as \u escapes so the pattern does not depend on
  # the encoding of this source file; both cases are listed explicitly.
  vowel_pattern <- paste0(
    "[aeiouyAEIOUY",
    "\u00e0\u00e2\u00e4\u00e6\u00e8\u00e9\u00ea\u00eb\u00ee\u00ef",
    "\u00f4\u00f6\u0153\u00f9\u00fb\u00fc\u00ff",
    "\u00c0\u00c2\u00c4\u00c6\u00c8\u00c9\u00ca\u00cb\u00ce\u00cf",
    "\u00d4\u00d6\u0152\u00d9\u00db\u00dc\u0178",
    "]"
  )

  # Mean of `x` ignoring NAs, or 0 when `x` is empty.
  mean_or_zero <- function(x) {
    if (length(x) > 0) mean(x, na.rm = TRUE) else 0
  }

  # Annotate `text` with the UDPipe model and derive the metrics.
  #
  # @param text A single character string.
  # @return A one-row data.frame with one column per metric; column names are
  #   the human-readable labels (spaces preserved).
  calculate_metrics <- function(text) {
    annotated_df <- as.data.frame(udpipe_annotate(model, x = text))

    word_count <- sum(annotated_df$upos %in% content_upos)
    sentence_ids <- unique(annotated_df$sentence_id)
    sentence_count <- length(sentence_ids)

    # Count vowel letters per token. gregexpr() reports "no match" as a single
    # -1, so count positive match positions rather than taking length().
    # This counts letters, not vowel groups, so digraphs such as "eau" are
    # over-counted.
    vowel_hits <- gregexpr(vowel_pattern, annotated_df$token)
    syllable_count <- sum(vapply(
      vowel_hits,
      function(hits) sum(hits > 0, na.rm = TRUE),
      integer(1)
    ))

    # NOTE(review): word_count covers content words only, while syllables and
    # lemma types are taken over all tokens (punctuation included), so the two
    # ratios below mix token sets and the type-token ratio can exceed 1.
    # Confirm the intended definitions before changing them.
    avg_sentence_length <-
      if (sentence_count > 0) word_count / sentence_count else 0
    avg_syllables_per_word <-
      if (word_count > 0) syllable_count / word_count else 0

    # Lemmas grouped by sentence, in order of first appearance.
    lemmas_by_sentence <- if (sentence_count > 0) {
      unname(split(
        annotated_df$lemma,
        factor(annotated_df$sentence_id, levels = sentence_ids)
      ))
    } else {
      list()
    }

    # Share of each sentence's lemmas that also occur in the preceding
    # sentence. seq_len() yields an empty sequence for 0 or 1 sentences,
    # unlike 2:length(x), which would run backwards.
    adjacent_cohesion <- vapply(
      seq_len(max(sentence_count - 1L, 0L)),
      function(i) {
        current <- lemmas_by_sentence[[i + 1L]]
        previous <- lemmas_by_sentence[[i]]
        length(intersect(current, previous)) / length(current)
      },
      numeric(1)
    )
    avg_sentence_to_sentence_cohesion <- mean_or_zero(adjacent_cohesion)

    # NOTE(review): text_lemmas contains every lemma of every sentence, so
    # this ratio equals distinct lemmas / tokens within each sentence.
    # Formula kept as-is; confirm this is the intended measure.
    text_lemmas <- unique(annotated_df$lemma)
    text_cohesion <- vapply(
      lemmas_by_sentence,
      function(lemmas) {
        length(intersect(lemmas, text_lemmas)) / length(lemmas)
      },
      numeric(1)
    )
    avg_text_to_sentence_cohesion <- mean_or_zero(text_cohesion)

    type_token_ratio <-
      if (word_count > 0) length(text_lemmas) / word_count else 0

    # check.names = FALSE keeps the labels readable instead of "Word.Count".
    data.frame(
      "Word Count" = word_count,
      "Sentence Count" = sentence_count,
      "Syllable Count" = syllable_count,
      "Average Sentence Length" = round(avg_sentence_length, 2),
      "Average Syllables per Word" = round(avg_syllables_per_word, 2),
      "Sentence-to-Sentence Lexical Cohesion" =
        round(avg_sentence_to_sentence_cohesion, 2),
      "Text-to-Sentence Lexical Cohesion" =
        round(avg_text_to_sentence_cohesion, 2),
      "Type-Token Ratio" = round(type_token_ratio, 2),
      check.names = FALSE
    )
  }

  # Extract a ZIP archive and return the contents of its .txt files.
  #
  # The archive is unpacked into a fresh directory that is removed on exit, so
  # files from earlier uploads (or anything else in tempdir()) are never
  # picked up.
  #
  # @param zip_path Path to the uploaded ZIP file.
  # @return Character vector with one element per non-blank .txt file, each
  #   file's lines joined by single spaces.
  read_corpus_texts <- function(zip_path) {
    extract_dir <- tempfile("corpus_")
    dir.create(extract_dir)
    on.exit(unlink(extract_dir, recursive = TRUE), add = TRUE)

    unzip(zip_path, exdir = extract_dir)
    # recursive = TRUE: archives often wrap their files in a top-level folder.
    txt_files <- list.files(
      extract_dir,
      pattern = "\\.txt$",
      full.names = TRUE,
      recursive = TRUE
    )
    # Skip "._name.txt" entries (metadata files some archivers add).
    txt_files <- txt_files[!startsWith(basename(txt_files), "._")]

    texts <- vapply(
      txt_files,
      function(path) {
        # Mark the lines as UTF-8; udpipe expects UTF-8 input.
        # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
        lines <- readLines(path, warn = FALSE, encoding = "UTF-8")
        paste(lines, collapse = " ")
      },
      character(1),
      USE.NAMES = FALSE
    )
    texts[nzchar(trimws(texts))]
  }

  # Analysis result: NULL when there is nothing to analyze, otherwise
  # list(data = <data.frame of metrics>, isCorpus = <logical>).
  # Problems are reported through notifications rather than validation errors
  # so that `output$isCorpus` below stays a plain TRUE/FALSE.
  results <- eventReactive(input$analyze, {
    zip_info <- input$corpus_zip
    # NOTE(review): presumably `corpus_zip` stays set once a file has been
    # uploaded, in which case the text box is ignored afterwards - verify.
    if (is.null(zip_info)) {
      text <- input$text
      if (isTRUE(nzchar(trimws(text)))) {
        list(data = calculate_metrics(text), isCorpus = FALSE)
      } else {
        showNotification(
          "Please enter some text or upload a ZIP file.",
          type = "warning"
        )
        NULL
      }
    } else {
      texts <- read_corpus_texts(zip_info$datapath)
      if (length(texts) == 0) {
        showNotification(
          "No non-empty .txt files were found in the uploaded ZIP.",
          type = "error"
        )
        NULL
      } else {
        n_texts <- length(texts)
        corpus_metrics <- withProgress(
          message = "Analyzing corpus",
          value = 0,
          lapply(texts, function(doc) {
            metrics <- calculate_metrics(doc)
            incProgress(1 / n_texts)
            metrics
          })
        )
        list(data = do.call(rbind, corpus_metrics), isCorpus = TRUE)
      }
    }
  })

  # Metrics table for a single text.
  output$results <- renderTable({
    res <- results()
    if (!is.null(res) && !res$isCorpus) {
      res$data
    }
  })

  # One box plot per metric across all corpus documents.
  output$corpusPlots <- renderPlot({
    res <- results()
    if (!is.null(res) && res$isCorpus) {
      melted_df <- melt(res$data)
      ggplot(melted_df, aes(x = variable, y = value)) +
        geom_boxplot() +
        facet_wrap(~ variable, scales = "free_y") +
        labs(
          x = NULL,
          y = "Value",
          title = "Corpus Analysis - Readability and Cohesion Metrics"
        ) +
        theme_minimal() +
        theme(
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank()
        )
    }
  })

  # Flag read by the UI's conditional panels; it must keep updating even
  # though no visible element renders it, hence suspendWhenHidden = FALSE.
  output$isCorpus <- reactive({
    res <- results()
    !is.null(res) && res$isCorpus
  })
  outputOptions(output, "isCorpus", suspendWhenHidden = FALSE)
}
# Build the app object from the `ui` and `server` definitions in this file.
shinyApp(
  ui = ui,
  server = server
)