In this guide, we walk through the analysis of a dataset from Rivadavia.
This R Markdown file generates an HTML report that details each step of the analysis, with a short discussion of the purpose and findings of each chunk of code.
# Clear the workspace
rm(list=ls())
# Load the packages needed for the analysis
library(ggplot2)
library(gridExtra)
library(dplyr)
library(knitr)
library(kableExtra)
library(FactoMineR)
library(factoextra)
library(corrplot)
library(missMDA)
library(cluster)
library(vcd)
library(logistf)
library(MuMIn)
library(Hmisc)
# Disable scientific notation globally
options(scipen = 999)
# Load the dataset
db <- read.csv(here::here("dbrivadavia.csv"))
# Remove rows with missing values (NA) in the first column
db <- subset(db, !is.na(db[,1]))
# List of categorical variables
cat_vars <- c("ID",
"Block",
"Unit",
"STATUS",
"INF",
"SITE",
"SPR",
"wall_material",
"wall_covering",
"roof_material",
"roof_covering",
"watertank",
"watertank_pigeon",
"palm",
"objects",
"insecticide",
"commercial",
"permethrin",
"other",
"travel_rural",
"dwelling_status",
"chicken_coop",
"rats",
"idcerca_NE",
"idcerca_NW",
"idcerca_S")
# Convert categorical variables to factors
db[cat_vars] <- lapply(db[cat_vars], as.factor)
# Convert the Date column from character to Date
db$DATE <- as.Date(db$DATE, format = "%d/%m/%Y")
db$SPR.DATE <- as.Date(db$SPR.DATE, format = "%d/%m/%Y")
# Subset the database to keep only houses that were inspected for triatomine infestation (STATUS == "EV")
dbEV <- subset(db,db$STATUS == "EV")
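A quick sanity check on the subsetting step confirms how many inspected (EV) houses remain; a minimal sketch, assuming db and dbEV as created above:
# Count houses by inspection status and confirm the size of the EV subset
table(db$STATUS, useNA = "ifany")
nrow(dbEV)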
First, we visualize the categorical explanatory variables against the infestation status of the household (INF).
# Function to create stacked bar plots and ignore NAs
create_stacked_bar_plot <- function(data, categorical_var, target_var) {
# Remove rows with NA in the current categorical variable
data <- data[!is.na(data[[categorical_var]]), ]
ggplot(data, aes_string(x = target_var, fill = categorical_var)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(y = NULL, fill = categorical_var, x = "Infestation") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(size = 8),
axis.title.x = element_text(size = 8),
axis.title.y = element_text(size = 8),
axis.text.x = element_text(size = 7),
axis.text.y = element_text(size = 7)
) +
ggtitle(categorical_var)
}
# Create list of categorical variables to visualize
categorical_vars <- setdiff(cat_vars, c(
"ID",
"Block",
"Unit",
"STATUS",
"INF",
"SITE",
"SPR",
"education_level",
"employer",
"employee",
"selfemployed",
"housework",
"not.employed",
"pension",
"benefit",
"health",
"idcerca_NE",
"idcerca_NW",
"idcerca_S"))
# Create a list to store the plots
cat_plots <- list()
# Convert INF variable to a factor with labels "NO" and "YES"
dbEV$INF_label <- factor(dbEV$INF, levels = c(0, 1), labels = c("NO", "YES"))
# Create plots for each remaining categorical variable and store in the list
for (var in categorical_vars) {
cat_plots[[var]] <- create_stacked_bar_plot(dbEV, var, "INF_label")
}
# Arrange the plots in a 4x4 grid
grid.arrange(grobs = cat_plots, ncol = 4, nrow = 4)
# Initialize an empty data frame to store results
contingency_results <- data.frame(Variable = character(),
Category = character(),
No_Infestation = numeric(),
Yes_Infestation = numeric(),
stringsAsFactors = FALSE)
# Calculate the proportion of each category by infestation status for each categorical variable
for (var in categorical_vars) {
# Create a contingency table
contingency_table <- table(dbEV[[var]], dbEV$INF)
# Calculate proportions
prop_table <- prop.table(contingency_table, margin = 2)
# Store results with proportions
for (level in rownames(contingency_table)) {
contingency_results <- rbind(contingency_results, data.frame(
Variable = var,
Category = level,
No_Infestation = prop_table[level, "0"],
Yes_Infestation = prop_table[level, "1"]
))
}
}
# Format and display the results as a table
contingency_results %>%
mutate(No_Infestation = scales::percent(No_Infestation, accuracy = 0.1),
Yes_Infestation = scales::percent(Yes_Infestation, accuracy = 0.1)) %>%
kable("html", col.names = c("Variable", "Category", "No Infestation", "Yes Infestation"), align = "c") %>%
kable_styling(full_width = FALSE, position = "left",
bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, color = "black") %>%
column_spec(2, color = "black") %>%
column_spec(3, color = "black") %>%
column_spec(4, color = "black") %>%
row_spec(0, bold = TRUE, color = "black")
Variable | Category | No Infestation | Yes Infestation |
---|---|---|---|
wall_material | 1 | 21.1% | 47.1% |
wall_material | 2 | 61.2% | 47.1% |
wall_material | 3 | 11.6% | 5.9% |
wall_material | 4 | 6.1% | 0.0% |
wall_covering | 0 | 8.9% | 30.8% |
wall_covering | 1 | 91.1% | 69.2% |
roof_material | 1 | 25.0% | 17.6% |
roof_material | 2 | 55.6% | 76.5% |
roof_material | 3 | 8.3% | 5.9% |
roof_material | 4 | 11.1% | 0.0% |
roof_covering | 0 | 50.0% | 80.0% |
roof_covering | 1 | 50.0% | 20.0% |
watertank | 0 | 11.7% | 16.7% |
watertank | 1 | 88.3% | 83.3% |
watertank_pigeon | 0 | 78.1% | 100.0% |
watertank_pigeon | 1 | 21.9% | 0.0% |
palm | 0 | 91.4% | 100.0% |
palm | 1 | 8.6% | 0.0% |
objects | 0 | 42.3% | 12.5% |
objects | 1 | 57.7% | 87.5% |
insecticide | 0 | 20.4% | 41.2% |
insecticide | 1 | 79.6% | 58.8% |
commercial | 0 | 53.1% | 50.0% |
commercial | 1 | 46.9% | 50.0% |
permethrin | 0 | 70.8% | 100.0% |
permethrin | 1 | 29.2% | 0.0% |
other | 0 | 64.6% | 50.0% |
other | 1 | 35.4% | 50.0% |
travel_rural | 0 | 53.1% | 66.7% |
travel_rural | 1 | 46.9% | 33.3% |
dwelling_status | 1 | 68.8% | 93.3% |
dwelling_status | 2 | 22.1% | 0.0% |
dwelling_status | 3 | 9.1% | 6.7% |
chicken_coop | 0 | 98.1% | 72.2% |
chicken_coop | 1 | 1.9% | 27.8% |
rats | 0 | 45.2% | 38.9% |
rats | 1 | 54.8% | 61.1% |
We perform Fisher’s exact test for each categorical variable to check
for significant associations with the INF
variable.
# Initialize an empty data frame to store results
fisher_results <- data.frame(Variable = character(),
p_value = numeric(),
stringsAsFactors = FALSE)
# Perform Fisher's exact test for each categorical variable
for (var in categorical_vars) {
test_result <- fisher.test(table(dbEV[[var]], dbEV$INF))
fisher_results <- rbind(fisher_results, data.frame(Variable = var, p_value = test_result$p.value))
}
# Display the results
fisher_results <- fisher_results %>%
mutate(Significance = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
TRUE ~ ""
))
# Format and display the results as a table
fisher_results %>%
mutate(p_value = formatC(p_value, format = "f", digits = 4)) %>%
kable("html", col.names = c("Variable", "P-value", "Significance"), align = "c") %>%
kable_styling(full_width = FALSE, position = "left",
bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, color = "black") %>%
column_spec(2, color = "black") %>%
column_spec(3, bold = TRUE, color = "red") %>%
row_spec(which(fisher_results$Significance != ""), bold = TRUE) %>%
row_spec(0, bold = TRUE, color = "black")
Variable | P-value | Significance |
---|---|---|
wall_material | 0.1474 | |
wall_covering | 0.0372 | * |
roof_material | 0.3866 | |
roof_covering | 0.0498 | * |
watertank | 0.4649 | |
watertank_pigeon | 0.0256 | * |
palm | 0.3651 | |
objects | 0.0288 | * |
insecticide | 0.0660 | |
commercial | 1.0000 | |
permethrin | 1.0000 | |
other | 1.0000 | |
travel_rural | 0.3248 | |
dwelling_status | 0.0893 | |
chicken_coop | 0.0003 | *** |
rats | 0.8029 | |
Next, we visualize the numeric explanatory variables against the infestation status of the household (INF).
# Function to create violin plots with jittered points
create_violin_plot <- function(data, numeric_var, target_var) {
# Remove rows with NA in the current numeric variable
data <- data[!is.na(data[[numeric_var]]), ]
ggplot(data, aes_string(x = target_var, y = numeric_var, fill = target_var)) +
geom_violin(trim = FALSE) +
geom_jitter(width = 0.2, size = 0.6, alpha = 0.6) +
labs(y = numeric_var, fill = target_var, x = target_var) +
theme_minimal() +
theme(
legend.position = "none",
plot.title = element_text(size = 8),
axis.title.x = element_text(size = 8),
axis.title.y = element_text(size = 8),
axis.text.x = element_text(size = 7),
axis.text.y = element_text(size = 7)
) +
ggtitle(numeric_var)
}
# Create list of numeric variables to visualize
numeric_var <- c("dog_num",
"cat_num",
"chicken_num",
"other_num",
"dist_lum",
"num_lum_50m",
"num_lum_100m",
"dist.infest",
"dist_NW",
"dist_S",
"num_infest_50m",
"num_infest_100m",
"num_S",
"num_S_infest_50m",
"num_S_infest_100m",
"num_NW",
"num_NW_infest_50m",
"num_NW_infest_100m",
"percent_green_50",
"percent_green_100")
# Create a list to store the plots
num_plots <- list()
# Create plots for each numeric variable and store in the list
for (var in numeric_var) {
num_plots[[var]] <- create_violin_plot(dbEV, var, "INF")
}
# Arrange the plots in a 4 x 5 grid (4 rows, 5 columns)
grid.arrange(grobs = num_plots, ncol = 5, nrow = 4)
We perform the non-parametric Wilcoxon rank-sum (Mann-Whitney U) test for each numeric variable to
check for significant associations with the INF
variable.
# Initialize an empty data frame to store results
results <- data.frame(Variable = character(),
p_value = numeric(),
stringsAsFactors = FALSE)
# Perform Mann-Whitney test for each numeric variable
for (var in numeric_var) {
test_result <- wilcox.test(dbEV[[var]] ~ dbEV$INF, data = dbEV)
results <- rbind(results, data.frame(Variable = var, p_value = test_result$p.value))
}
# Display the results
results <- results %>%
mutate(Significance = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
TRUE ~ ""
))
# Format p-values to four decimal places
results$p_value <- formatC(results$p_value, format = "f", digits = 4)
# Format the table for the HTML report
results %>%
kable("html", col.names = c("Variable", "P-value", "Significance"), align = "c") %>%
kable_styling(full_width = FALSE, position = "left", bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, color = "black") %>%
column_spec(2, color = "black") %>%
column_spec(3, bold = TRUE, color = "red") %>%
row_spec(which(results$Significance != ""), bold = TRUE) %>%
row_spec(0, bold = TRUE, color = "black")
Variable | P-value | Significance |
---|---|---|
dog_num | 0.2355 | |
cat_num | 0.4886 | |
chicken_num | 0.0000 | *** |
other_num | 0.0952 | |
dist_lum | 0.0410 | * |
num_lum_50m | 0.7524 | |
num_lum_100m | 0.0203 | * |
dist.infest | 0.0460 | * |
dist_NW | 0.5073 | |
dist_S | 0.7514 | |
num_infest_50m | 0.0000 | *** |
num_infest_100m | 0.0000 | *** |
num_S | 0.8242 | |
num_S_infest_50m | 0.0000 | *** |
num_S_infest_100m | 0.0000 | *** |
num_NW | 0.8073 | |
num_NW_infest_50m | 0.5132 | |
num_NW_infest_100m | 0.3501 | |
percent_green_50 | 0.2281 | |
percent_green_100 | 0.2740 | |
We first explore the need to create a new summary variable for the animal hosts. As a first step, we computed a Spearman correlation matrix of the numbers of animal hosts.
# Extract numeric variables related to animals
animal_vars <- c("dog_num", "cat_num", "chicken_num", "other_num")
# Compute the Spearman correlation matrix with significance
library(Hmisc)
cor_res <- rcorr(as.matrix(dbEV[animal_vars]), type = "spearman")
# Extract the correlation coefficients and p-values
cor_matrix <- cor_res$r
p_matrix <- cor_res$P
# Set the diagonal elements of both matrices to NA
diag(cor_matrix) <- NA
diag(p_matrix) <- NA
# Format the matrices with three decimal places and no scientific notation
formatted_cor_matrix <- formatC(cor_matrix, format = "f", digits = 3)
formatted_p_matrix <- formatC(p_matrix, format = "f", digits = 3)
# Convert matrices to data frames for better kable formatting
cor_df <- as.data.frame(formatted_cor_matrix)
p_df <- as.data.frame(formatted_p_matrix)
# Create row and column names for clarity
rownames(cor_df) <- colnames(cor_df) <- animal_vars
rownames(p_df) <- colnames(p_df) <- animal_vars
# Print the correlation matrix using kable
kable(cor_df, caption = "Spearman Correlation Matrix", align = "c") %>%
kable_styling(full_width = FALSE, position = "left",
bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1:ncol(cor_df), color = "black") %>%
row_spec(0, color = "black", bold = TRUE)
 | dog_num | cat_num | chicken_num | other_num |
---|---|---|---|---|
dog_num | NA | 0.363 | 0.193 | 0.180 |
cat_num | 0.363 | NA | 0.008 | 0.015 |
chicken_num | 0.193 | 0.008 | NA | 0.227 |
other_num | 0.180 | 0.015 | 0.227 | NA |
# Print the p-value matrix using kable
kable(p_df, caption = "P-Value Matrix for Spearman Correlations", align = "c") %>%
kable_styling(full_width = FALSE, position = "left",
bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1:ncol(cor_df), color = "black") %>%
row_spec(0, color = "black", bold = TRUE)
 | dog_num | cat_num | chicken_num | other_num |
---|---|---|---|---|
dog_num | NA | 0.000 | 0.012 | 0.021 |
cat_num | 0.000 | NA | 0.915 | 0.847 |
chicken_num | 0.012 | 0.915 | NA | 0.003 |
other_num | 0.021 | 0.847 | 0.003 | NA |
# Visualize the correlation matrix without the diagonal elements
corrplot(cor_matrix, method = "circle", na.label = " ")
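As an optional refinement (a sketch reusing cor_res from rcorr() above), the p-value matrix can be overlaid on the correlation plot so that non-significant pairs are blanked out:
# Blank out pairs whose Spearman p-value exceeds 0.05
corrplot(cor_res$r, method = "circle", type = "upper", diag = FALSE,
         p.mat = cor_res$P, sig.level = 0.05, insig = "blank")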
Next, we performed a factor analysis of mixed data (FAMD) to include both numeric and categorical variables (presence of rats). We did not include the presence of pigeons in water tanks, since none of the infested houses had them.
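That exclusion can be verified directly; a quick check, assuming dbEV as defined above:
# Confirm that no infested house had pigeons in the water tank
table(watertank_pigeon = dbEV$watertank_pigeon, INF = dbEV$INF, useNA = "ifany")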
# Extract the relevant variables, including categorical variables
famd_vars <- c("dog_num", "cat_num", "chicken_num", "other_num", "rats")
# Ensure the categorical variables are factors
dbEV[famd_vars] <- lapply(dbEV[famd_vars], function(x) if(is.character(x)) as.factor(x) else x)
# Impute missing values using imputeFAMD
imputed_data <- imputeFAMD(dbEV[famd_vars], ncp = 4) #ncp is the number of dimensions to use for imputation
# Perform FAMD on the imputed dataset
famd_res <- FAMD(imputed_data$completeObs, ncp = 4, graph = FALSE)
# Print the summary of FAMD results
summary(famd_res)
##
## Call:
## FAMD(base = imputed_data$completeObs, ncp = 4, graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4
## Variance 1.534 1.052 0.954 0.785
## % of var. 30.685 21.032 19.078 15.690
## Cumulative % of var. 30.685 51.718 70.795 86.485
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 3 | 1.201 | 0.397 0.051 0.109 | 0.043 0.001 0.001 | -0.072
## 6 | 1.589 | 0.944 0.289 0.353 | 0.030 0.000 0.000 | -0.139
## 7 | 1.498 | -1.469 0.699 0.961 | 0.087 0.004 0.003 | 0.101
## 8 | 1.450 | -1.395 0.631 0.925 | -0.006 0.000 0.000 | 0.089
## 9 | 1.498 | -1.469 0.699 0.961 | 0.087 0.004 0.003 | 0.101
## 10 | 1.450 | -1.395 0.631 0.925 | -0.006 0.000 0.000 | 0.089
## 11 | 1.498 | -1.469 0.699 0.961 | 0.087 0.004 0.003 | 0.101
## 24 | 1.474 | -0.114 0.004 0.006 | -0.192 0.018 0.017 | -0.019
## 25 | 1.487 | -1.409 0.644 0.898 | 0.189 0.017 0.016 | 0.288
## 26 | 1.201 | 0.397 0.051 0.109 | 0.043 0.001 0.001 | -0.072
## ctr cos2
## 3 0.003 0.004 |
## 6 0.010 0.008 |
## 7 0.005 0.005 |
## 8 0.004 0.004 |
## 9 0.005 0.005 |
## 10 0.004 0.004 |
## 11 0.005 0.005 |
## 24 0.000 0.000 |
## 25 0.043 0.037 |
## 26 0.003 0.004 |
##
## Continuous variables
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr
## dog_num | 0.757 37.339 0.573 | -0.015 0.022 0.000 | -0.073 0.552
## cat_num | 0.537 18.775 0.288 | -0.562 29.983 0.315 | -0.069 0.498
## chicken_num | 0.242 3.814 0.059 | 0.727 50.205 0.528 | -0.555 32.316
## other_num | 0.322 6.764 0.104 | 0.456 19.773 0.208 | 0.796 66.446
## cos2
## dog_num 0.005 |
## cat_num 0.005 |
## chicken_num 0.308 |
## other_num 0.634 |
##
## Categories
## Dim.1 ctr cos2 v.test Dim.2 ctr cos2 v.test
## 0 | -0.847 15.909 0.685 -10.110 | 0.014 0.009 0.000 0.195
## 1 | 0.926 17.400 0.685 10.110 | -0.015 0.009 0.000 -0.195
## Dim.3 ctr cos2 v.test
## 0 | 0.040 0.090 0.001 0.598 |
## 1 | -0.043 0.098 0.001 -0.598 |
Visualize the FAMD results
# Visualize the FAMD results in the first 2 dimensions
fviz_famd_var(famd_res, repel = TRUE) +
labs(title = "Variables factor map")
# Visualize the contributions of quantitative variables
fviz_famd_var(famd_res, choice = "quanti.var", repel = TRUE) +
labs(title = "Quantitative Variables factor map")
# Modify INF labels to be more descriptive
dbEV$INF_label <- factor(dbEV$INF, levels = c(0, 1), labels = c("Not Infested", "Infested"))
# Visualize the FAMD results with individuals grouped by infestation status
fviz_famd(famd_res,
habillage = dbEV$INF_label, # Use descriptive labels for coloring
palette = c("blue", "red"), # Custom colors: "Not Infested" (blue), "Infested" (red)
repel = TRUE, # Avoid overlapping text labels
addEllipses = TRUE, # Add confidence ellipses
label = "none") + # Disable individual labels
labs(title = "FAMD - Individuals grouped by infestation",
color = "Infestation Status", fill = "Infestation Status") + # Set a single legend title
theme(legend.position = "right")
Explore the use of PCA for the animal hosts using the numeric variables only
# Extract the quantitative variables for PCA
quant_vars <- c("dog_num", "cat_num", "chicken_num", "other_num")
quant_data <- dbEV[quant_vars]
# Ensure the data does not contain NA values
quant_data <- na.omit(quant_data)
# Perform PCA
pca_res <- prcomp(quant_data, scale. = TRUE)
# Print summary of PCA results
summary(pca_res)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.1253 1.0276 0.9770 0.8505
## Proportion of Variance 0.3166 0.2640 0.2386 0.1808
## Cumulative Proportion 0.3166 0.5806 0.8192 1.0000
# Extract the quantitative variables for PCA
quant_vars <- c("dog_num", "cat_num", "chicken_num", "other_num")
quant_data <- dbEV[quant_vars]
# Impute missing values using missMDA's PCA-based method
# Estimate the number of dimensions to keep for imputation
ncp <- estim_ncpPCA(quant_data, scale = TRUE)$ncp
# Perform the imputation
imputed_data <- imputePCA(quant_data, ncp = ncp, scale = TRUE)$completeObs
# Perform PCA on the imputed data
pca_res <- prcomp(imputed_data, scale. = TRUE)
# Print summary of PCA results
summary(pca_res)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.1238 1.0262 0.9770 0.8541
## Proportion of Variance 0.3157 0.2633 0.2386 0.1824
## Cumulative Proportion 0.3157 0.5790 0.8176 1.0000
# Extract PCA coordinates (Principal Components)
pca_coords <- as.data.frame(pca_res$x)
# Rename the columns for clarity (e.g., PCA_Dim1, PCA_Dim2, etc.)
colnames(pca_coords) <- paste0("PCA_animal_Dim", 1:ncol(pca_coords))
# Add PCA dimensions back to the original dataset
dbEV <- cbind(dbEV, pca_coords)
Visualize the PCA results and evaluate association with infestation
# Plot variables
fviz_pca_var(pca_res, repel = TRUE) +
labs(title = "PCA - Variables")
# Plot individuals with custom colors and descriptive labels in the legend
fviz_pca_ind(pca_res,
geom.ind = "point", # Plot individuals as points
habillage = dbEV$INF_label, # Use descriptive labels for coloring
addEllipses = TRUE, # Add confidence ellipses
palette = c("blue", "red"), # Custom colors: "Not Infested" (blue), "Infested" (red)
legend.title = "Infestation Status", # Set a single legend title
label = "none") + # Disable individual labels
scale_color_manual(values = c("Not Infested" = "blue", "Infested" = "red")) + # Ensure consistent color mapping
theme(legend.position = "right") # Position legend on the right
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
# Define all variables for Mann-Whitney U tests
all_vars <- c(colnames(pca_coords))
# Remove variables containing "NA"
all_vars <- subset(all_vars, !is.na(all_vars))
# Initialize an empty data frame to store results
results <- data.frame(Variable = character(),
p_value = numeric(),
Significance = character(),
stringsAsFactors = FALSE)
# Perform Mann-Whitney test for each variable
for (var in all_vars) {
test_result <- wilcox.test(dbEV[[var]] ~ dbEV$INF, data = dbEV)
results <- rbind(results, data.frame(Variable = var,
p_value = formatC(test_result$p.value, format = "f", digits = 4),
Significance = ifelse(test_result$p.value < 0.05, ifelse(test_result$p.value < 0.01, ifelse(test_result$p.value < 0.001, "***", "**"), "*"), "")))
}
# p-values were already formatted to four decimal places inside the loop above,
# so no further formatting is needed here
# Format the table for the HTML report
results %>%
kable("html", col.names = c("Variable", "P-value", "Significance"), align = "c") %>%
kable_styling(full_width = FALSE, position = "left", bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, color = "black") %>%
column_spec(2, color = "black") %>%
column_spec(3, bold = TRUE, color = "red") %>%
row_spec(which(results$Significance != ""), bold = TRUE) %>%
row_spec(0, bold = TRUE, color = "black")
Variable | P-value | Significance |
---|---|---|
PCA_animal_Dim1 | 0.0233 | * |
PCA_animal_Dim2 | 0.3070 | |
PCA_animal_Dim3 | 0.0159 | * |
PCA_animal_Dim4 | 0.0591 | |
The PCA performs slightly better than the FAMD (it explains more variance), so we proceed with it. The first two PCA dimensions explain 31.6% and 26.3% of the variance, respectively. The first dimension separates households with a larger number of animal hosts, and the second dimension distinguishes, in particular, households with a larger number of chickens and other animals rather than just companion animals (dogs and cats). The third dimension is associated with the number of chickens. Given that there are only 4 dimensions in total (one per variable), the first two dimensions explain ~58% of the variance, and the numbers of dogs and cats are significantly correlated. The animal host variables selected for the infestation models were therefore the number of dogs and the number of chickens; however, we also try PCA dimension 1 in the model as an alternative summary of the total number of animal hosts.
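The interpretation of the dimensions above can be checked against the loadings directly; a minimal sketch, assuming pca_res from the PCA on the imputed data above:
# Variable loadings on each principal component (signs are arbitrary)
round(pca_res$rotation, 3)
# Variance explained per component, for reference
round(summary(pca_res)$importance, 3)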
We used cluster analysis to build a "house type" variable based on the construction materials and characteristics of the walls and roof. We first performed a multiple correspondence analysis (MCA) to summarize the categorical variables into orthogonal numeric variables to be used in the cluster analysis.
# Define the variables related to house type
house_vars <- c("wall_material", "wall_covering", "roof_material", "roof_covering")
# Ensure the variables are treated as factors
dbEV[house_vars] <- lapply(dbEV[house_vars], as.factor)
# Impute missing values
imputed_data <- imputeMCA(dbEV[house_vars], ncp = 3)
# Perform Multiple Correspondence Analysis (MCA) on imputed data
mca_res <- MCA(imputed_data$completeObs, graph = TRUE)
# Extract MCA coordinates for individuals
mca_coords <- as.data.frame(mca_res$ind$coord)
colnames(mca_coords) <- paste0("MCA_Dim", 1:ncol(mca_coords))
# Combine the original data frame with the MCA results
dbEV <- cbind(dbEV, mca_coords)
# Create a scree plot to visualize the variance explained by each dimension
fviz_screeplot(mca_res, addlabels = TRUE, ylim = c(0, 100)) +
labs(title = "Scree Plot - Variance Explained by Each MCA Dimension",
x = "Dimensions", y = "Percentage of Variance Explained")
We then performed a hierarchical clustering analysis on the first 5 MCA dimensions (~77% of the variance).
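The cumulative variance quoted above can be read off the MCA eigenvalues; a quick check, assuming mca_res from the house-type MCA above:
# Eigenvalues and cumulative percentage of variance for the house-type MCA
round(head(mca_res$eig), 3)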
# Perform hierarchical clustering
diss_matrix <- daisy(mca_coords, metric = "euclidean")
hc_res <- hclust(diss_matrix, method = "ward.D2")
# Plot the dendrogram
plot(hc_res, labels = FALSE, main = "Hierarchical Clustering Dendrogram")
# Cut the dendrogram to obtain 3 clusters
clusters <- cutree(hc_res, k = 3)
# Create a table summarizing the clusters
cluster_table <- data.frame(Cluster = clusters, dbEV[house_vars])
print("Cluster summary table:")
## [1] "Cluster summary table:"
print(table(cluster_table$Cluster))
##
## 1 2 3
## 128 40 33
# Add the cluster assignments as a new categorical variable to dbEV
dbEV$house_type <- as.factor(clusters)
Next, we visualized the clusters (house types) based on the MCA variables, and characterized the clusters based on the construction characteristics of the houses.
# Plot the individuals colored by house_type
fviz_mca_ind(mca_res,
label = "none", # Hide individual labels
habillage = dbEV$house_type, # Color by house_type
addEllipses = TRUE, # Add confidence ellipses
ellipse.level = 0.95, # Confidence level
palette = "jco") # Color palette
# Plot the variables
fviz_mca_var(mca_res,
repel = TRUE, # Avoid text overlapping
col.var = "black") # Color of variables
# Plot a biplot showing both individuals and variables colored by house_type
fviz_mca_biplot(mca_res,
repel = TRUE, # Avoid text overlapping
col.var = "black", # Color of variables
habillage = dbEV$house_type, # Color by house_type
palette = "jco") # Color palette
## Warning: ggrepel: 162 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Create stacked bar plots for each house variable without NAs
plot_list <- lapply(house_vars, function(var) {
ggplot(na.omit(dbEV[, c(var, "house_type")]), aes_string(x = var, fill = "house_type")) +
geom_bar(position = "fill") +
labs(y = "Proportion", fill = "House Type") +
theme_minimal() +
ggtitle(paste("Distribution of", var))
})
# Arrange the plots in a grid
do.call(grid.arrange, c(plot_list, ncol = 2))
# Create stacked bar plots for each house variable with house_type on the x-axis and without NAs
plot_list <- lapply(house_vars, function(var) {
ggplot(na.omit(dbEV[, c(var, "house_type")]), aes_string(x = "house_type", fill = var)) +
geom_bar(position = "fill") +
labs(y = "Proportion", fill = var) +
theme_minimal() +
ggtitle(paste("Distribution of", var, "by House Type"))
})
# Arrange the plots in a grid
do.call(grid.arrange, c(plot_list, ncol = 2))
To do: describe the construction characteristics of the houses here and characterize the clusters.
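As a starting point for that characterization, the construction variables can be cross-tabulated against the clusters; a minimal sketch, assuming cluster_table and house_vars from the chunks above:
# Proportion of each construction category within each house-type cluster
for (var in house_vars) {
  cat("\n", var, "\n")
  print(round(prop.table(table(Cluster = cluster_table$Cluster,
                               cluster_table[[var]]), margin = 1), 2))
}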
# Define the additional variables for MCA
additional_vars <- c("education_level", "pension", "benefit", "health")
# Recode health categories 3 and 4 as 2, and specify .default to keep the other values
dbEV$health <- recode(as.numeric(dbEV$health), `4` = 2, `3` = 2, .default = as.numeric(dbEV$health))
# Recode education_level categories 4 and 5 as 3, and specify .default to handle other values
dbEV$education_level <- recode(as.numeric(dbEV$education_level), `4` = 3, `5` = 3, .default = as.numeric(dbEV$education_level))
# Create the income_work variable
dbEV$income_work <- with(dbEV, ifelse(employer == 1 | employee == 1 | selfemployed == 1, "income_work",
ifelse(housework == 1 | not.employed == 1, "not_income_work", NA)))
# Ensure the variables are treated as factors
dbEV[additional_vars] <- lapply(dbEV[additional_vars], as.factor)
dbEV$income_work <- as.factor(dbEV$income_work)
# Create frequency tables for each variable
health_table <- as.data.frame(table(dbEV$health))
education_table <- as.data.frame(table(dbEV$education_level))
benefit_table <- as.data.frame(table(dbEV$benefit))
pension_table <- as.data.frame(table(dbEV$pension))
income_work_table <- as.data.frame(table(dbEV$income_work))
# Rename columns for clarity
colnames(health_table) <- c("Category", "Health_Freq")
colnames(education_table) <- c("Category", "Education_Freq")
colnames(benefit_table) <- c("Category", "Benefit_Freq")
colnames(pension_table) <- c("Category", "Pension_Freq")
colnames(income_work_table) <- c("Category", "Income_Work_Freq")
# Merge the tables by "Category" with all data included (full join)
combined_table <- Reduce(function(x, y) merge(x, y, by = "Category", all = TRUE),
list(health_table, education_table, benefit_table, pension_table, income_work_table))
# Display the combined table using kable
kable(combined_table, caption = "Frequency Tables for Health, Education Level, Benefit, Pension, and Income Work", align = "c") %>%
kable_styling(full_width = FALSE, position = "left",
bootstrap_options = c("striped", "hover", "condensed")) %>%
row_spec(0, bold = TRUE, color = "black") %>%
column_spec(1:ncol(combined_table), color = "black")
Category | Health_Freq | Education_Freq | Benefit_Freq | Pension_Freq | Income_Work_Freq |
---|---|---|---|---|---|
1 | 45 | 37 | 27 | 82 | NA |
2 | 112 | 78 | NA | NA | NA |
3 | NA | 51 | NA | NA | NA |
0 | NA | NA | 108 | 76 | NA |
income_work | NA | NA | NA | NA | 124 |
not_income_work | NA | NA | NA | NA | 35 |
Next, we performed an MCA on the categorical variables to summarize the socio-demographic variables.
# Combine income_work with additional variables for MCA
mca_vars <- c(additional_vars, "income_work")
# Check for missing values in the combined data
missing_values <- sapply(dbEV[mca_vars], function(x) sum(is.na(x)))
print("Missing values in each variable:")
## [1] "Missing values in each variable:"
print(missing_values)
## education_level pension benefit health income_work
## 35 43 66 44 42
# Impute missing values
imputed_data <- imputeMCA(dbEV[mca_vars], ncp = 3)
# Perform Multiple Correspondence Analysis (MCA) on imputed data
mca_res <- MCA(imputed_data$completeObs, graph = TRUE)
# Extract MCA coordinates for individuals
mca_coords <- as.data.frame(mca_res$ind$coord)
colnames(mca_coords) <- paste0("MCA_SEE_Dim", 1:ncol(mca_coords))
# Combine the original data frame with the MCA results
dbEV <- cbind(dbEV, mca_coords)
We then visualized the results of the MCA in more detail.
# Visualize the contribution of variables to the first dimension
fviz_contrib(mca_res, choice = "var", axes = 1, top = 10)
# Visualize the contribution of variables to the second dimension
fviz_contrib(mca_res, choice = "var", axes = 2, top = 10)
# Visualize the biplot of the MCA results
fviz_mca_biplot(mca_res,
repel = TRUE, # Avoid text overlapping
col.var = "black", # Color of variables
habillage = "none", # Color by income_work
palette = "jco", # Color palette
addEllipses = FALSE, # Add confidence ellipses
ellipse.level = 0.95) # Confidence level
## Warning: ggrepel: 146 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Get the variance explained by each dimension
eigenvalues <- mca_res$eig
print("Eigenvalues and variance explained by each dimension:")
## [1] "Eigenvalues and variance explained by each dimension:"
print(eigenvalues)
## eigenvalue percentage of variance cumulative percentage of variance
## dim 1 0.2983585 24.86321 24.86321
## dim 2 0.2288132 19.06777 43.93097
## dim 3 0.1981309 16.51090 60.44188
## dim 4 0.1924189 16.03491 76.47679
## dim 5 0.1566999 13.05833 89.53512
## dim 6 0.1255786 10.46488 100.00000
We then performed a hierarchical clustering of the households based on the first three MCA dimensions (~60% of the variance).
# Extract the first three dimensions of the MCA coordinates for individuals
mca_coords <- as.data.frame(mca_res$ind$coord[, 1:3])
colnames(mca_coords) <- paste0("MCA_SEE_Dim", 1:3)
# Perform hierarchical clustering on the first three dimensions
diss_matrix <- dist(mca_coords) # Compute dissimilarity matrix
hc_res <- hclust(diss_matrix, method = "ward.D2")
# Plot the dendrogram
plot(hc_res, labels = FALSE, main = "Hierarchical Clustering Dendrogram")
# Cut the dendrogram to obtain clusters (e.g., 3 clusters)
num_clusters <- 3
clusters <- cutree(hc_res, k = num_clusters)
# Add the cluster results back to the original data frame
dbEV$SEE <- as.factor(clusters)
table(dbEV$SEE)
##
## 1 2 3
## 107 60 34
Finally, we visualized the clusters based on the MCA results for interpretation
# Visualize the biplot of the MCA results with clusters
fviz_mca_biplot(mca_res,
repel = TRUE, # Avoid text overlapping
col.var = "black", # Color of variables
habillage = dbEV$SEE, # Color by clusters
palette = "jco", # Color palette
addEllipses = TRUE, # Add confidence ellipses
ellipse.level = 0.95) # Confidence level
## Warning: ggrepel: 159 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Create stacked bar plots for each socio-demographic variable without NAs
plot_list <- lapply(mca_vars, function(var) {
ggplot(na.omit(dbEV[, c(var, "SEE")]), aes_string(x = var, fill = "SEE")) +
geom_bar(position = "fill") +
labs(y = "Proportion", fill = "SEE") +
theme_minimal() +
ggtitle(paste("Distribution of", var))
})
# Arrange the plots in a grid
do.call(grid.arrange, c(plot_list, ncol = 2))
# Create stacked bar plots for each socio-demographic variable with SEE on the x-axis and without NAs
plot_list <- lapply(mca_vars, function(var) {
ggplot(na.omit(dbEV[, c(var, "SEE")]), aes_string(x = "SEE", fill = var)) +
geom_bar(position = "fill") +
labs(y = "Proportion", fill = var) +
theme_minimal() +
ggtitle(paste("Distribution of", var, "by SEE"))
})
# Arrange the plots in a grid
do.call(grid.arrange, c(plot_list, ncol = 2))
To do: write the interpretation of the SEE clusters, and rename SEE to SES (socio-economic status).
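As a starting point for that interpretation, the socio-demographic variables can be summarized within each SEE cluster; a minimal sketch, assuming dbEV and mca_vars from the chunks above:
# Category proportions within each SEE cluster (a numerical complement to the plots above)
for (var in mca_vars) {
  cat("\n", var, "\n")
  print(round(prop.table(table(SEE = dbEV$SEE, dbEV[[var]]), margin = 1), 2))
}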
dbEV$dwelling_status <- as.factor(dbEV$dwelling_status)
# Create a contingency table between the cluster (SEE) and dwelling_status
contingency_table <- table(SEE = dbEV$SEE, DwellingStatus = dbEV$dwelling_status)
# Print the contingency table with variable titles and margins
print("Contingency Table:")
## [1] "Contingency Table:"
contingency_table_with_margins <- addmargins(contingency_table)
print(ftable(contingency_table_with_margins))
## DwellingStatus 1 2 3 Sum
## SEE
## 1 62 12 5 79
## 2 35 19 5 59
## 3 23 3 5 31
## Sum 120 34 15 169
# Perform the chi-square test of independence
chisq_test <- chisq.test(contingency_table)
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
# Print the results of the chi-square test
print("Chi-square Test Results:")
## [1] "Chi-square Test Results:"
print(chisq_test)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 11.142, df = 4, p-value = 0.02501
# Convert the contingency table to a data frame for ggplot2
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("SEE", "DwellingStatus", "Count")
# Create a bar plot of the proportions
ggplot(contingency_df, aes(x = SEE, y = Count, fill = DwellingStatus)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(y = "Proportion", x = "Socio-Economic Status (SEE)", fill = "Dwelling Status") +
ggtitle("Proportions of Dwelling Status within SEE Groups") +
theme_minimal()
# Create a mosaic plot of the contingency table
mosaic(~SEE + DwellingStatus, data = contingency_table,
shade = TRUE, legend = TRUE,
main = "Mosaic Plot of Dwelling Status by SEE")
dbEV$house_type <- as.factor(dbEV$house_type)
# Create a contingency table between the cluster (SEE) and house_type
contingency_table <- table(SEE = dbEV$SEE, HouseType = dbEV$house_type)
# Print the contingency table with variable titles and margins
print("Contingency Table:")
## [1] "Contingency Table:"
contingency_table_with_margins <- addmargins(contingency_table)
print(ftable(contingency_table_with_margins))
## HouseType 1 2 3 Sum
## SEE
## 1 74 14 19 107
## 2 30 20 10 60
## 3 24 6 4 34
## Sum 128 40 33 201
# Perform the chi-square test of independence
chisq_test <- chisq.test(contingency_table)
# Print the results of the chi-square test
print("Chi-square Test Results:")
## [1] "Chi-square Test Results:"
print(chisq_test)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 11.115, df = 4, p-value = 0.0253
# Convert the contingency table to a data frame for ggplot2
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("SEE", "HouseType", "Count")
# Create a bar plot of the proportions
ggplot(contingency_df, aes(x = SEE, y = Count, fill = HouseType)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(y = "Proportion", x = "Socio-Economic Status (SEE)", fill = "HouseType") +
ggtitle("Proportions of House Type within SEE Groups") +
theme_minimal()
# Create a mosaic plot of the contingency table
mosaic(~SEE + HouseType, data = contingency_table,
shade = TRUE, legend = TRUE,
main = "Mosaic Plot of House Type by SEE")
dbEV$insecticide <- as.factor(dbEV$insecticide)
# Create a contingency table between the cluster (SEE) and insecticide use
contingency_table <- table(SEE = dbEV$SEE, Insecticide = dbEV$insecticide)
# Print the contingency table with variable titles and margins
print("Contingency Table:")
## [1] "Contingency Table:"
contingency_table_with_margins <- addmargins(contingency_table)
print(ftable(contingency_table_with_margins))
## Insecticide 0 1 Sum
## SEE
## 1 20 61 81
## 2 11 48 59
## 3 8 26 34
## Sum 39 135 174
# Perform the chi-square test of independence
chisq_test <- chisq.test(contingency_table)
# Print the results of the chi-square test
print("Chi-square Test Results:")
## [1] "Chi-square Test Results:"
print(chisq_test)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 0.74809, df = 2, p-value = 0.6879
# Convert the contingency table to a data frame for ggplot2
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("SEE", "Insecticide", "Count")
# Create a bar plot of the proportions
ggplot(contingency_df, aes(x = SEE, y = Count, fill = Insecticide)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(y = "Proportion", x = "Socio-Economic Status (SEE)", fill = "Insecticide") +
ggtitle("Proportions of House Type within SEE Groups") +
theme_minimal()
# Create a mosaic plot of the contingency table
mosaic(~SEE + Insecticide, data = contingency_table,
shade = TRUE, legend = TRUE,
main = "Mosaic Plot of Insecticide use by SEE")
Lastly, we fitted three models to evaluate household infestation status:
# Define the additional categorical variables
categorical_var <- c("SEE", "house_type")
# Initialize an empty data frame to store results
results_chisq <- data.frame(Variable = character(),
p_value = numeric(),
stringsAsFactors = FALSE)
# Perform Chi-square test for each categorical variable
for (var in categorical_var) {
# Remove rows with NA values in the specific variables
valid_data <- dbEV %>% select(all_of(var), INF) %>% na.omit()
# Create contingency table
contingency_table <- table(valid_data[[var]], valid_data$INF)
# Perform chi-square test
test_result <- chisq.test(contingency_table)
# Store results
results_chisq <- rbind(results_chisq, data.frame(Variable = var, p_value = test_result$p.value))
}
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
# Display the results
results_chisq <- results_chisq %>%
mutate(Significance = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
TRUE ~ ""
))
# Format p-values to four decimal places
results_chisq$p_value <- formatC(results_chisq$p_value, format = "f", digits = 4)
# Format the table for the HTML report
formatted_table_chisq <- results_chisq %>%
kable("html", col.names = c("Variable", "P-value", "Significance"), align = "c") %>%
kable_styling(full_width = FALSE, position = "left", bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, bold = TRUE, color = "black") %>%
column_spec(2, color = "black") %>%
column_spec(3, bold = TRUE, color = "black") %>%
row_spec(0, color = "black", bold = TRUE)
# Print the formatted table
formatted_table_chisq
Variable | P-value | Significance |
---|---|---|
SEE | 0.8551 | |
house_type | 0.5294 | |
The SES and house type variables were not associated with the infestation status of the house. Next, we ran the models using Firth's penalized logistic regression (logistf), which is better suited than standard logistic regression to the small number of infested houses.
#select variables
dbEV_mod1 <- select(dbEV, INF, SEE, house_type, chicken_coop, objects, chicken_num, dog_num, wall_covering, roof_covering, PCA_animal_Dim1)
# Model 1a: with the number of dogs and the presence of a chicken coop (instead of the number of chickens)
dbEV_mod1$house_type <- relevel(factor(dbEV_mod1$house_type), ref = "2")
fit1<-logistf(data = dbEV_mod1, INF ~ house_type + chicken_coop + objects + dog_num, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.omit)
summary(fit1)
## logistf(formula = INF ~ house_type + chicken_coop + objects +
## dog_num, data = dbEV_mod1, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.omit)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -3.24352962 0.7881712 -5.1730364 -1.8903445 34.09248672
## house_type1 0.24816084 0.7783504 -1.2421891 2.0457512 0.09688231
## house_type3 -0.27110625 0.9818707 -2.3557155 1.8109296 0.07073390
## chicken_coop1 2.30663445 0.8016294 0.7320637 3.9780758 7.96858870
## objects1 0.96062723 0.7314035 -0.4388935 2.7012503 1.74003014
## dog_num 0.02967268 0.1725631 -0.3610981 0.3667029 0.02596800
## p method
## (Intercept) 0.000000005255379 2
## house_type1 0.755603554782079 2
## house_type3 0.790271207719516 2
## chicken_coop1 0.004759603338522 2
## objects1 0.187135007376231 2
## dog_num 0.871978488250774 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=13.99973 on 5 df, p=0.01561111, n=158
## Wald test = 64.29003 on 5 df, p = 0.000000000001573075
# Model 1b: with PCA dimension 1 as an index of the total number of animal hosts, instead of the number of dogs
fit1b<-logistf(data = dbEV_mod1, INF ~ house_type + chicken_coop + objects + PCA_animal_Dim1, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.omit)
summary(fit1b)
## logistf(formula = INF ~ house_type + chicken_coop + objects +
## PCA_animal_Dim1, data = dbEV_mod1, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.omit)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -3.1734905 0.7881777 -5.1055405 -1.8227946 32.88584105
## house_type1 0.2312874 0.7610383 -1.2115518 2.0057562 0.08812514
## house_type3 -0.3098004 0.9632120 -2.3486030 1.7374288 0.09634008
## chicken_coop1 2.0054308 0.8290798 0.3629840 3.7279144 5.64760519
## objects1 0.9473739 0.7112456 -0.3975415 2.6597209 1.82655983
## PCA_animal_Dim1 0.2015658 0.2078830 -0.2777670 0.6041351 0.76388754
## p method
## (Intercept) 0.000000009773249 2
## house_type1 0.766574323659760 2
## house_type3 0.756266691160554 2
## chicken_coop1 0.017479227293958 2
## objects1 0.176534422222332 2
## PCA_animal_Dim1 0.382114659173373 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=14.9768 on 5 df, p=0.01046193, n=164
## Wald test = 65.68222 on 5 df, p = 0.0000000000008090195
# Model 1c: replaces house type with wall and roof covering, which were significant in the bivariate analyses
fit1c<-logistf(data = dbEV_mod1, INF ~ wall_covering + roof_covering + chicken_coop + objects + PCA_animal_Dim1, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.omit)
summary(fit1c)
## logistf(formula = INF ~ wall_covering + roof_covering + chicken_coop +
## objects + PCA_animal_Dim1, data = dbEV_mod1, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.omit)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -2.0414706 1.0269912 -4.6838513 -0.1764189 4.7023509
## wall_covering1 -1.7176185 0.8965726 -3.5838011 0.2544595 2.9861507
## roof_covering1 -0.3327281 0.8327502 -2.2802046 1.4376450 0.1376429
## chicken_coop1 3.8189131 1.3448303 1.3323207 7.1217844 9.5952001
## objects1 1.2457615 0.9313826 -0.5839831 3.6281952 1.7241705
## PCA_animal_Dim1 -0.5015937 0.4549452 -1.7308895 0.3717814 1.1538464
## p method
## (Intercept) 0.030121390 2
## wall_covering1 0.083979577 2
## roof_covering1 0.710635645 2
## chicken_coop1 0.001950867 2
## objects1 0.189157097 2
## PCA_animal_Dim1 0.282745410 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=16.87287 on 5 df, p=0.004747313, n=104
## Wald test = 37.29731 on 5 df, p = 0.0000005221187
# Calculate the AIC for the three models
aic_fit1 <- AIC(fit1)
aic_fit1b <- AIC(fit1b)
aic_fit1c <- AIC(fit1c)
# Print the AIC values
print(paste("AIC for Model 1a (with dog_num):", aic_fit1))
## [1] "AIC for Model 1a (with dog_num): 98.390439882464"
print(paste("AIC for Model 1b (with PCA_animal_Dim1):", aic_fit1b))
## [1] "AIC for Model 1b (with PCA_animal_Dim1): 104.463977194883"
print(paste("AIC for Model 1c (with wall/roof covering):", aic_fit1c))
## [1] "AIC for Model 1c (with wall/roof covering): 64.2754498199873"
# Note: VIF computation with logistf is not directly supported,
# so we use a standard logistic regression model to compute VIF
# Fit a standard logistic regression model for VIF calculation
fit_standard <- glm(INF ~ wall_covering + roof_covering + chicken_coop + objects + dog_num,
data = dbEV_mod1,
family = binomial,
na.action = na.omit)
# Compute VIF using the car package
vif_values <- car::vif(fit_standard)
# Display the VIF values
print(vif_values)
## wall_covering roof_covering chicken_coop objects dog_num
## 1.329212 1.213121 2.298461 1.138195 2.097412
#Multi-model inference and model averaging
dbEV_mod1 <- select(dbEV, INF, chicken_coop, objects, wall_covering, roof_covering, PCA_animal_Dim1)
dbEV_mod1 <- dbEV_mod1[complete.cases(dbEV_mod1), ]
table(dbEV$INF)
##
## 0 1
## 181 20
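# Quick check (assuming dbEV as defined above): the outcome is highly unbalanced and
# some predictor cells are sparse, which is what the penalized (Firth) likelihood in
# logistf is designed to handle
table(chicken_coop = dbEV$chicken_coop, INF = dbEV$INF, useNA = "ifany")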
fit1c<-logistf(data = dbEV_mod1, INF ~ wall_covering + roof_covering + chicken_coop + objects + PCA_animal_Dim1, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
summary(fit1c)
## logistf(formula = INF ~ wall_covering + roof_covering + chicken_coop +
## objects + PCA_animal_Dim1, data = dbEV_mod1, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.fail)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -2.0414706 1.0269912 -4.6838513 -0.1764189 4.7023509
## wall_covering1 -1.7176185 0.8965726 -3.5838011 0.2544595 2.9861507
## roof_covering1 -0.3327281 0.8327502 -2.2802046 1.4376450 0.1376429
## chicken_coop1 3.8189131 1.3448303 1.3323207 7.1217844 9.5952001
## objects1 1.2457615 0.9313826 -0.5839831 3.6281952 1.7241705
## PCA_animal_Dim1 -0.5015937 0.4549452 -1.7308895 0.3717814 1.1538464
## p method
## (Intercept) 0.030121390 2
## wall_covering1 0.083979577 2
## roof_covering1 0.710635645 2
## chicken_coop1 0.001950867 2
## objects1 0.189157097 2
## PCA_animal_Dim1 0.282745410 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=16.87287 on 5 df, p=0.004747313, n=104
## Wald test = 37.29731 on 5 df, p = 0.0000005221187
MMI <- dredge(fit1c)
## Fixed term is "(Intercept)"
MMI
## Global model call: logistf(formula = INF ~ wall_covering + roof_covering + chicken_coop +
## objects + PCA_animal_Dim1, data = dbEV_mod1, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.fail)
## ---
## Model selection table
## (Int) chc_cop obj PCA_anm_Dm1 rof_cvr wll_cvr df logLik AICc delta weight
## 5 -2.380 0.23660 2 -26.133 56.4 0.00 0.147
## 1 -2.429 1 -27.192 56.4 0.04 0.144
## 13 -1.986 0.20380 + 3 -25.778 57.8 1.41 0.072
## 9 -2.010 + 2 -26.852 57.8 1.44 0.072
## 3 -3.476 + 2 -26.853 57.8 1.44 0.071
## 7 -3.414 + 0.05883 3 -25.815 57.9 1.49 0.070
## 21 -1.107 0.29820 + 3 -26.303 58.8 2.46 0.043
## 17 -1.224 + 2 -27.380 58.9 2.49 0.042
## 11 -3.064 + + 3 -26.524 59.3 2.90 0.034
## 15 -3.001 + 0.07994 + 4 -25.468 59.3 2.95 0.034
## 2 -2.844 + 2 -27.700 59.5 3.13 0.031
## 6 -2.965 + -0.47970 3 -26.697 59.6 3.25 0.029
## 19 -2.210 + + 3 -27.020 60.3 3.89 0.021
## 29 -0.958 0.27850 + + 4 -25.948 60.3 3.92 0.021
## 25 -1.072 + + 3 -27.036 60.3 3.93 0.021
## 23 -2.103 + 0.13560 + 4 -25.966 60.3 3.95 0.020
## 10 -2.401 + + 3 -27.340 60.9 4.53 0.015
## 4 -3.476 + + 3 -27.363 61.0 4.58 0.015
## 14 -2.489 + -0.41190 + 4 -26.323 61.0 4.66 0.014
## 8 -3.805 + + -0.61440 4 -26.367 61.1 4.75 0.014
## 27 -2.015 + + + 4 -26.690 61.8 5.40 0.010
## 31 -1.894 + 0.15630 + + 5 -25.620 61.9 5.47 0.010
## 18 -1.224 + + 3 -27.869 62.0 5.59 0.009
## 22 -1.420 + -0.48390 + 4 -26.849 62.1 5.72 0.008
## 12 -2.977 + + + 4 -27.015 62.4 6.05 0.007
## 16 -3.292 + + -0.51930 + 5 -26.001 62.6 6.23 0.007
## 26 -1.070 + + + 4 -27.507 63.4 7.03 0.004
## 20 -1.869 + + + 4 -27.513 63.4 7.04 0.004
## 30 -1.258 + -0.38820 + + 5 -26.476 63.6 7.18 0.004
## 24 -2.237 + + -0.58210 + 5 -26.501 63.6 7.23 0.004
## 28 -1.660 + + + + 5 -27.165 64.9 8.56 0.002
## 32 -2.041 + + -0.50160 + + 6 -26.138 65.1 8.76 0.002
## Models ranked by AICc(x)
write.table(MMI, "clipboard", sep="\t")
MAS <- model.avg(MMI)
summary(MAS)
##
## Call:
## model.avg(object = MMI)
##
## Component model call:
## logistf(formula = INF ~ <32 unique rhs>, data = dbEV_mod1, control =
## logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
##
## Component models:
## df logLik AICc delta weight
## 3 2 -26.13 56.38 0.00 0.15
## (Null) 1 -27.19 56.42 0.04 0.14
## 34 3 -25.78 57.80 1.41 0.07
## 4 2 -26.85 57.82 1.44 0.07
## 2 2 -26.85 57.82 1.44 0.07
## 23 3 -25.82 57.87 1.49 0.07
## 35 3 -26.30 58.85 2.46 0.04
## 5 2 -27.38 58.88 2.49 0.04
## 24 3 -26.52 59.29 2.90 0.03
## 234 4 -25.47 59.34 2.95 0.03
## 1 2 -27.70 59.52 3.13 0.03
## 13 3 -26.70 59.63 3.25 0.03
## 25 3 -27.02 60.28 3.89 0.02
## 345 4 -25.95 60.30 3.92 0.02
## 45 3 -27.04 60.31 3.93 0.02
## 235 4 -25.97 60.34 3.95 0.02
## 14 3 -27.34 60.92 4.53 0.02
## 12 3 -27.36 60.97 4.58 0.01
## 134 4 -26.32 61.05 4.66 0.01
## 123 4 -26.37 61.14 4.75 0.01
## 245 4 -26.69 61.78 5.40 0.01
## 2345 5 -25.62 61.85 5.47 0.01
## 15 3 -27.87 61.98 5.59 0.01
## 135 4 -26.85 62.10 5.72 0.01
## 124 4 -27.01 62.43 6.05 0.01
## 1234 5 -26.00 62.62 6.23 0.01
## 145 4 -27.51 63.42 7.03 0.00
## 125 4 -27.51 63.43 7.04 0.00
## 1345 5 -26.48 63.56 7.18 0.00
## 1235 5 -26.50 63.61 7.23 0.00
## 1245 5 -27.16 64.94 8.56 0.00
## 12345 6 -26.14 65.14 8.76 0.00
##
## Term codes:
## chicken_coop objects PCA_animal_Dim1 roof_covering wall_covering
## 1 2 3 4 5
##
## Model-averaged coefficients:
## (full average)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.39515 0.96830 2.474 0.0134 *
## PCA_animal_Dim1 0.03872 0.30274 0.128 0.8982
## roof_covering1 -0.26244 0.58637 0.448 0.6545
## objects1 0.48258 0.87524 0.551 0.5814
## wall_covering1 -0.32967 0.73992 0.446 0.6559
## chicken_coop1 0.59643 1.42269 0.419 0.6750
##
## (conditional average)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.39515 0.96830 2.474 0.01338 *
## PCA_animal_Dim1 0.07782 0.42562 0.183 0.85492
## roof_covering1 -0.80018 0.78615 1.018 0.30875
## objects1 1.48879 0.93020 1.601 0.10948
## wall_covering1 -1.46508 0.87724 1.670 0.09490 .
## chicken_coop1 3.52803 1.27670 2.763 0.00572 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sw(MAS)
## PCA_animal_Dim1 roof_covering objects wall_covering
## Sum of weights: 0.50 0.33 0.32 0.23
## N containing models: 16 16 16 16
## chicken_coop
## Sum of weights: 0.17
## N containing models: 16
#Coefficients and confidence intervals
ci<-confint(MAS)
# Extracting p-values from the summary of the conditional average model
summary_MAS <- summary(MAS)
pval <- summary_MAS$coefmat.subset[, "Pr(>|z|)"]
# Combining OR, CI, and p-values into a table
table.coef <- cbind(exp(cbind (OR=coef(MAS),ci)),pval)
print(table.coef, digits = 3)
## OR 2.5 % 97.5 % pval
## (Intercept) 0.0912 0.0137 0.608 0.01338
## PCA_animal_Dim1 1.0809 0.4694 2.489 0.85492
## roof_covering1 0.4492 0.0962 2.097 0.30875
## objects1 4.4317 0.7158 27.438 0.10948
## wall_covering1 0.2311 0.0414 1.290 0.09490
## chicken_coop1 34.0568 2.7892 415.844 0.00572
write.table(table.coef, "clipboard", sep="\t")
Of all the models (1a-1c), the model including wall and roof covering instead of house type had the lowest AIC. However, when performing model averaging, only the presence of a chicken coop was significantly associated with infestation, and six models fell within 2 AICc units of the best model, including the null model. This indicates that the model set is not robust and that a substantial amount of variance is not explained by the presence of chicken coops alone.
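The set of competing models can be extracted directly from the dredge table; a minimal sketch, assuming MMI from the chunk above:
# Models within 2 AICc units of the best-ranked model (the confidence set discussed above)
top_models <- subset(MMI, delta < 2)
top_models
nrow(top_models)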
dbEV_mod2 <- select(dbEV, INF, dist_lum, num_S_infest_50m, num_S_infest_100m, dist.infest)
dbEV_mod2 <- dbEV_mod2[complete.cases(dbEV_mod2), ]
fit2<-logistf(data=dbEV_mod2, INF ~ dist_lum + num_S_infest_100m, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
summary(fit2)
## logistf(formula = INF ~ dist_lum + num_S_infest_100m, data = dbEV_mod2,
## control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -3.67981361 0.51352773 -4.795538408 -2.74081376 Inf
## dist_lum 0.01981901 0.01325445 -0.009467053 0.04504547 1.902084
## num_S_infest_100m 1.45624804 0.28186004 0.934998231 2.07127192 33.502979
## p method
## (Intercept) 0.000000000000000 2
## dist_lum 0.167845265056839 2
## num_S_infest_100m 0.000000007115494 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=35.62756 on 2 df, p=0.00000001834739, n=201
## Wald test = 71.67107 on 2 df, p = 0.0000000000000002220446
# Fit a standard logistic regression model for VIF calculation
fit_standard <- glm(INF ~ dist_lum + num_S_infest_50m + num_S_infest_100m + dist.infest,
data = dbEV_mod2,
family = binomial,
na.action = na.fail)
# Compute VIF using the car package
vif_values <- car::vif(fit_standard)
# Display the VIF values
print(vif_values)
## dist_lum num_S_infest_50m num_S_infest_100m dist.infest
## 1.090275 21.346858 17.628302 2.126020
# Extract confidence intervals
ci <- confint(fit2)
# Extract odds ratios (OR)
or <- exp(coef(fit2))
# Extract p-values
p_values <- fit2$prob
# Combine OR, confidence intervals, and p-values into one table
table.coef <- cbind(OR = or, ci, P.value = p_values)
# Print the table with specified number of digits
print(table.coef, format = "f", digits = 3)
## OR Lower 95% Upper 95% P.value
## (Intercept) 0.0252 -4.79554 -2.741 0.00000000000
## dist_lum 1.0200 -0.00947 0.045 0.16784526506
## num_S_infest_100m 4.2898 0.93500 2.071 0.00000000712
# Convert the table to a data frame for better handling
table.coef <- as.data.frame(table.coef)
# Print the formatted table
print(table.coef)
## OR Lower 95% Upper 95% P.value
## (Intercept) 0.02522768 -4.795538408 -2.74081376 0.000000000000000
## dist_lum 1.02001671 -0.009467053 0.04504547 0.167845265056839
## num_S_infest_100m 4.28983400 0.934998231 2.07127192 0.000000007115494
# Optionally, write the table to the clipboard
write.table(table.coef, "clipboard", sep = "\t", col.names = NA, row.names = TRUE)
From the bivariate analyses, we decided to include the distance to street lights, the number of infested houses to the South within 50 m and 100 m radii, and the distance to the nearest infested house. Because of multicollinearity among the last three variables, we continued only with the number of infested houses within 100 m to the South.
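A quick re-check of the variance inflation factors after dropping the collinear terms supports that choice; a sketch using a standard glm as above, assuming dbEV_mod2 as defined earlier:
# VIFs for the reduced spatial model
fit_check <- glm(INF ~ dist_lum + num_S_infest_100m,
                 data = dbEV_mod2, family = binomial, na.action = na.omit)
car::vif(fit_check)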
3. Combined model
# Combined model with spatial variables
dbEV_mod3 <- select(dbEV, INF, chicken_coop, objects, wall_covering, roof_covering, PCA_animal_Dim1, num_S_infest_100m, dist_lum)
dbEV_mod3 <- dbEV_mod3[complete.cases(dbEV_mod3), ]
fit3<-logistf(data=dbEV_mod3, INF ~ wall_covering + roof_covering + chicken_coop + num_S_infest_100m, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
summary(fit3)
## logistf(formula = INF ~ wall_covering + roof_covering + chicken_coop +
## num_S_infest_100m, data = dbEV_mod3, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.fail)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -2.060508 0.8575915 -4.1574581 -0.4863096 6.927464
## wall_covering1 -1.915892 1.0382381 -4.2376714 0.2286594 3.101711
## roof_covering1 -2.115163 1.2058849 -5.4616362 0.1248173 3.371579
## chicken_coop1 4.221911 1.2874148 1.8215968 7.6463463 12.369084
## num_S_infest_100m 1.771262 0.5764847 0.7117322 3.4275353 11.649284
## p method
## (Intercept) 0.0084881974 2
## wall_covering1 0.0782100603 2
## roof_covering1 0.0663301411 2
## chicken_coop1 0.0004365018 2
## num_S_infest_100m 0.0006422726 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=26.10362 on 4 df, p=0.0000301581, n=104
## Wald test = 23.85293 on 4 df, p = 0.00008548336
#Coefficients and confidence intervals
ci<-confint(fit3)
table.coef <- exp(cbind (OR=coef(fit3),ci))
print(table.coef, digits = 3)
## OR Lower 95% Upper 95%
## (Intercept) 0.127 0.01565 0.615
## wall_covering1 0.147 0.01444 1.257
## roof_covering1 0.121 0.00425 1.133
## chicken_coop1 68.164 6.18172 2092.984
## num_S_infest_100m 5.878 2.03752 30.801
write.table(table.coef, "clipboard", sep="\t")
MMI <- dredge(fit3)
## Fixed term is "(Intercept)"
MMI
## Global model call: logistf(formula = INF ~ wall_covering + roof_covering + chicken_coop +
## num_S_infest_100m, data = dbEV_mod3, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.fail)
## ---
## Model selection table
## (Int) chc_cop num_S_inf_100 rof_cvr wll_cvr df logLik AICc delta weight
## 1 -2.429 1 -27.192 56.4 0.00 0.244
## 3 -3.075 0.9529 2 -26.438 57.0 0.57 0.183
## 5 -2.010 + 2 -26.852 57.8 1.40 0.121
## 7 -2.703 1.4540 + 3 -26.085 58.4 1.99 0.090
## 9 -1.224 + 2 -27.380 58.9 2.45 0.072
## 11 -1.804 1.0110 + 3 -26.605 59.5 3.03 0.054
## 2 -2.844 + 2 -27.700 59.5 3.10 0.052
## 4 -3.691 + 1.0960 3 -26.927 60.1 3.67 0.039
## 13 -1.072 + + 3 -27.036 60.3 3.89 0.035
## 15 -1.829 1.4310 + + 4 -26.251 60.9 4.48 0.026
## 6 -2.401 + + 3 -27.340 60.9 4.50 0.026
## 8 -3.382 + 1.7030 + 4 -26.555 61.5 5.09 0.019
## 10 -1.224 + + 3 -27.869 62.0 5.56 0.015
## 12 -1.988 + 1.2850 + 4 -27.077 62.6 6.13 0.011
## 14 -1.070 + + + 4 -27.507 63.4 6.99 0.007
## 16 -2.061 + 1.7710 + + 5 -26.706 64.0 7.60 0.005
## Models ranked by AICc(x)
write.table(MMI, "clipboard", sep="\t")
MAS <- model.avg(MMI)
summary(MAS)
##
## Call:
## model.avg(object = MMI)
##
## Component model call:
## logistf(formula = INF ~ <16 unique rhs>, data = dbEV_mod3, control =
## logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
##
## Component models:
## df logLik AICc delta weight
## (Null) 1 -27.19 56.42 0.00 0.24
## 2 2 -26.44 57.00 0.57 0.18
## 3 2 -26.85 57.82 1.40 0.12
## 23 3 -26.08 58.41 1.99 0.09
## 4 2 -27.38 58.88 2.45 0.07
## 24 3 -26.61 59.45 3.03 0.05
## 1 2 -27.70 59.52 3.10 0.05
## 12 3 -26.93 60.09 3.67 0.04
## 34 3 -27.04 60.31 3.89 0.03
## 234 4 -26.25 60.91 4.48 0.03
## 13 3 -27.34 60.92 4.50 0.03
## 123 4 -26.55 61.51 5.09 0.02
## 14 3 -27.87 61.98 5.56 0.02
## 124 4 -27.08 62.56 6.13 0.01
## 134 4 -27.51 63.42 6.99 0.01
## 1234 5 -26.71 64.02 7.60 0.01
##
## Term codes:
## chicken_coop num_S_infest_100m roof_covering wall_covering
## 1 2 3 4
##
## Model-averaged coefficients:
## (full average)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3917 0.8495 2.815 0.00487 **
## num_S_infest_100m 0.4970 0.6518 0.763 0.44573
## roof_covering1 -0.4853 0.9571 0.507 0.61209
## wall_covering1 -0.3302 0.7547 0.438 0.66167
## chicken_coop1 0.5992 1.3749 0.436 0.66296
##
## (conditional average)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3917 0.8495 2.815 0.00487 **
## num_S_infest_100m 1.1608 0.4709 2.465 0.01370 *
## roof_covering1 -1.4699 1.1520 1.276 0.20197
## wall_covering1 -1.4648 0.9296 1.576 0.11508
## chicken_coop1 3.4223 1.0654 3.212 0.00132 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sw(MAS)
## num_S_infest_100m roof_covering wall_covering chicken_coop
## Sum of weights: 0.43 0.33 0.23 0.18
## N containing models: 8 8 8 8
#Coefficients and confidence intervals
ci<-confint(MAS)
# Extracting p-values from the summary of the conditional average model
summary_MAS <- summary(MAS)
pval <- summary_MAS$coefmat.subset[, "Pr(>|z|)"]
# Combining OR, CI, and p-values into a table
table.coef <- cbind(exp(cbind (OR=coef(MAS),ci)),pval)
print(table.coef, digits = 3)
## OR 2.5 % 97.5 % pval
## (Intercept) 0.0915 0.0173 0.483 0.00487
## num_S_infest_100m 3.1924 1.2685 8.034 0.01370
## roof_covering1 0.2299 0.0240 2.199 0.20197
## wall_covering1 0.2311 0.0374 1.429 0.11508
## chicken_coop1 30.6403 3.7970 247.254 0.00132
write.table(table.coef, "clipboard", sep="\t")
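For the written report, the model-averaged odds ratios can be rendered with kable in the same style as the earlier tables; a minimal sketch, assuming table.coef from the chunk above:
# Display the model-averaged odds ratios, 95% CIs and p-values as a formatted table
kable(as.data.frame(table.coef), digits = 3, align = "c",
      caption = "Model-averaged odds ratios for the combined model") %>%
  kable_styling(full_width = FALSE, position = "left",
                bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(0, bold = TRUE, color = "black")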