# MiniAlsi / app.R
library(shiny)
library(udpipe)
library(stringr)
library(ggplot2)
library(reshape2)
# Load the French GSD model (ensure it's downloaded and adjust the path if necessary)
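# Optional fallback (a sketch, not in the original app): download the french-gsd
# model on first run if the .udpipe file is missing. With udpipe's default 2.5
# model repository this produces the file name hard-coded below, but verify the
# resulting path on your setup.
if (!file.exists("french-gsd-ud-2.5-191206.udpipe")) {
  udpipe_download_model(language = "french-gsd", model_dir = ".")
}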
model <- udpipe_load_model("french-gsd-ud-2.5-191206.udpipe")
# Define UI for the application
ui <- fluidPage(
titlePanel("French Readability and Cohesion Analyzer with UDPipe"),
sidebarLayout(
sidebarPanel(
fileInput("corpus_zip", "Upload ZIP with TXT files (optional)",
accept = c(".zip")),
textAreaInput("text", "Or enter French text directly:", value = "",
placeholder = "Type or paste French text here",
width = '100%', height = '200px', resize = "both"),
actionButton("analyze", "Analyze")
),
mainPanel(
h3("Readability and Cohesion Features"),
conditionalPanel(
condition = "output.isCorpus == false",
tableOutput("results")
),
conditionalPanel(
condition = "output.isCorpus == true",
plotOutput("corpusPlots")
)
)
)
)
# Define server logic
server <- function(input, output, session) {
# Helper function to calculate metrics for a given text
calculate_metrics <- function(text) {
annotated <- udpipe_annotate(model, x = text)
annotated_df <- as.data.frame(annotated)
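# "Word count" here counts content words only (nouns, verbs, adjectives, adverbs)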
word_count <- nrow(annotated_df[annotated_df$upos %in% c("NOUN", "VERB", "ADJ", "ADV"), ])
sentence_count <- length(unique(annotated_df$sentence_id))
# Approximate syllables by counting vowels per token; gregexpr() returns -1 when a
# token has no vowel, so count only positive match positions
syllable_count <- sum(sapply(gregexpr("[aeiouyAEIOUY]", annotated_df$token), function(x) sum(x > 0)))
avg_sentence_length <- ifelse(sentence_count > 0, word_count / sentence_count, 0)
avg_syllables_per_word <- ifelse(word_count > 0, syllable_count / word_count, 0)
sentence_ids <- unique(annotated_df$sentence_id)
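# Sentence-to-sentence cohesion: for each sentence, the share of its lemmas that
# also appear in the preceding sentence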
cohesion_values <- c()
for (i in seq_along(sentence_ids)[-1]) {  # skips safely when there is only one sentence
current_sentence <- annotated_df[annotated_df$sentence_id == sentence_ids[i], "lemma"]
previous_sentence <- annotated_df[annotated_df$sentence_id == sentence_ids[i - 1], "lemma"]
shared_words <- length(intersect(current_sentence, previous_sentence))
cohesion_values <- c(cohesion_values, shared_words / length(current_sentence))
}
avg_sentence_to_sentence_cohesion <- ifelse(length(cohesion_values) > 0, mean(cohesion_values, na.rm = TRUE), 0)
# Text-to-sentence cohesion: share of each sentence's lemmas that also occur
# elsewhere in the text (the current sentence is excluded from the comparison
# set, otherwise every lemma would trivially match itself)
text_sentence_cohesion <- sapply(sentence_ids, function(sid) {
sentence_words <- annotated_df[annotated_df$sentence_id == sid, "lemma"]
other_words <- annotated_df[annotated_df$sentence_id != sid, "lemma"]
shared_words <- length(intersect(sentence_words, other_words))
shared_words / length(sentence_words)
})
avg_text_to_sentence_cohesion <- mean(text_sentence_cohesion, na.rm = TRUE)
# Type-token ratio over all tokens (unique lemmas / total tokens), so the value stays in [0, 1]
type_token_ratio <- length(unique(annotated_df$lemma)) / nrow(annotated_df)
data.frame(
"Word Count" = word_count,
"Sentence Count" = sentence_count,
"Syllable Count" = syllable_count,
"Average Sentence Length" = round(avg_sentence_length, 2),
"Average Syllables per Word" = round(avg_syllables_per_word, 2),
"Sentence-to-Sentence Lexical Cohesion" = round(avg_sentence_to_sentence_cohesion, 2),
"Text-to-Sentence Lexical Cohesion" = round(avg_text_to_sentence_cohesion, 2),
"Type-Token Ratio" = round(type_token_ratio, 2)
)
}
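# Example (illustrative): calculate_metrics("Le chat dort sur le tapis. Le chat mange.")
# returns a one-row data.frame containing the eight metrics defined above.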
# Reactive to handle single text or corpus input
results <- eventReactive(input$analyze, {
if (is.null(input$corpus_zip)) {
# Single text mode
text <- input$text
if (nchar(text) > 0) {
list(data = calculate_metrics(text), isCorpus = FALSE)
} else {
NULL
}
} else {
# Corpus mode: analyze each file in the uploaded ZIP
# Extract into a fresh, unique directory so files from a previous upload are not re-analyzed
temp_dir <- tempfile("corpus_")
dir.create(temp_dir)
unzip(input$corpus_zip$datapath, exdir = temp_dir)
txt_files <- list.files(temp_dir, pattern = "\\.txt$", full.names = TRUE, recursive = TRUE)
corpus_metrics <- list()
n_files <- length(txt_files)
# Progress bar for corpus analysis
withProgress(message = 'Analyzing corpus', value = 0, {
for (i in seq_along(txt_files)) {
text <- readLines(txt_files[i], warn = FALSE, encoding = "UTF-8")  # French texts are assumed to be UTF-8
corpus_metrics[[i]] <- calculate_metrics(paste(text, collapse = " "))
# Update progress bar
incProgress(1 / n_files)
}
})
# Combine metrics into a data frame
corpus_metrics_df <- do.call(rbind, corpus_metrics)
list(data = corpus_metrics_df, isCorpus = TRUE)
}
})
# Display results table for single text mode
output$results <- renderTable({
if (!is.null(results()) && !results()$isCorpus) {
results()$data
}
})
# Display box plots for corpus mode, using facets for individual scales
output$corpusPlots <- renderPlot({
if (!is.null(results()) && results()$isCorpus) {
corpus_metrics_df <- results()$data
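# melt() reshapes the per-file metrics to long format (one row per file-metric pair) for faceting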
melted_df <- melt(corpus_metrics_df)
ggplot(melted_df, aes(x = variable, y = value)) +
geom_boxplot() +
facet_wrap(~ variable, scales = "free_y") +
labs(x = NULL, y = "Value", title = "Corpus Analysis - Readability and Cohesion Metrics") +
theme_minimal() +
theme(axis.text.x = element_blank(), # Hide x-axis labels for individual boxes
axis.ticks.x = element_blank())
}
})
# Boolean for UI conditionals
output$isCorpus <- reactive({
!is.null(results()) && results()$isCorpus
})
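# Keep isCorpus evaluated even while hidden so the conditionalPanel() conditions in the UI stay current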
outputOptions(output, "isCorpus", suspendWhenHidden = FALSE)
}
# Run the application
shinyApp(ui = ui, server = server)