In this guide, we walk through the analysis of a dataset from Rivadavia.
This R Markdown file generates an HTML report that details each step of the analysis, with a short discussion of the purpose and findings of each chunk of code.
# Clear the workspace
rm(list=ls())
# Load the packages needed for the analysis
library(ggplot2)
library(gridExtra)
library(dplyr)
library(knitr)
library(kableExtra)
library(FactoMineR)
library(factoextra)
library(corrplot)
library(missMDA)
library(cluster)
library(vcd)
library(logistf)
library(MuMIn)
library(Hmisc)
# Disable scientific notation globally
options(scipen = 999)
# Load the dataset
db <- read.csv(here::here("dbrivadavia.csv"))
# Remove rows with missing values (NA) in the first column
db <- subset(db, !is.na(db[,1]))
# List of categorical variables
cat_vars <- c("ID",
"Block",
"Unit",
"STATUS",
"INF",
"SITE",
"SPR",
"wall_material",
"wall_covering",
"roof_material",
"roof_covering",
"watertank",
"watertank_pigeon",
"palm",
"objects",
"insecticide",
"commercial",
"permethrin",
"other",
"travel_rural",
"dwelling_status",
"chicken_coop",
"rats",
"idcerca_NE",
"idcerca_NW",
"idcerca_S")
# Convert categorical variables to factors
db[cat_vars] <- lapply(db[cat_vars], as.factor)
# Convert the Date column from character to Date
db$DATE <- as.Date(db$DATE, format = "%d/%m/%Y")
db$SPR.DATE <- as.Date(db$SPR.DATE, format = "%d/%m/%Y")
# Subset the database to keep only houses that were inspected for triatomine infestation (STATUS == "EV")
dbEV <- subset(db,db$STATUS == "EV")
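A quick sanity check on the subsetting step confirms how many inspected (EV) houses remain; a minimal sketch, assuming db and dbEV as created above:
# Count houses by inspection status and confirm the size of the EV subset
table(db$STATUS, useNA = "ifany")
nrow(dbEV)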
First, we visualize the categorical explanatory variables against the infestation status of the household (INF).
# Function to create stacked bar plots and ignore NAs
create_stacked_bar_plot <- function(data, categorical_var, target_var) {
# Remove rows with NA in the current categorical variable
data <- data[!is.na(data[[categorical_var]]), ]
ggplot(data, aes_string(x = target_var, fill = categorical_var)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(y = NULL, fill = categorical_var, x = "Infestation") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(size = 8),
axis.title.x = element_text(size = 8),
axis.title.y = element_text(size = 8),
axis.text.x = element_text(size = 7),
axis.text.y = element_text(size = 7)
) +
ggtitle(categorical_var)
}
# Create list of categorical variables to visualize
categorical_vars <- setdiff(cat_vars, c(
"ID",
"Block",
"Unit",
"STATUS",
"INF",
"SITE",
"SPR",
"education_level",
"employer",
"employee",
"selfemployed",
"housework",
"not.employed",
"pension",
"benefit",
"health",
"idcerca_NE",
"idcerca_NW",
"idcerca_S"))
# Create a list to store the plots
cat_plots <- list()
# Convert INF variable to a factor with labels "NO" and "YES"
dbEV$INF_label <- factor(dbEV$INF, levels = c(0, 1), labels = c("NO", "YES"))
# Create plots for each remaining categorical variable and store in the list
for (var in categorical_vars) {
cat_plots[[var]] <- create_stacked_bar_plot(dbEV, var, "INF_label")
}
# Arrange the plots in a 4x4 grid
grid.arrange(grobs = cat_plots, ncol = 4, nrow = 4)
# Initialize an empty data frame to store results
contingency_results <- data.frame(Variable = character(),
Category = character(),
No_Infestation = numeric(),
Yes_Infestation = numeric(),
stringsAsFactors = FALSE)
# Calculate the proportion of each category by infestation status for each categorical variable
for (var in categorical_vars) {
# Create a contingency table
contingency_table <- table(dbEV[[var]], dbEV$INF)
# Calculate proportions
prop_table <- prop.table(contingency_table, margin = 2)
# Store results with proportions
for (level in rownames(contingency_table)) {
contingency_results <- rbind(contingency_results, data.frame(
Variable = var,
Category = level,
No_Infestation = prop_table[level, "0"],
Yes_Infestation = prop_table[level, "1"]
))
}
}
# Format and display the results as a table
contingency_results %>%
mutate(No_Infestation = scales::percent(No_Infestation, accuracy = 0.1),
Yes_Infestation = scales::percent(Yes_Infestation, accuracy = 0.1)) %>%
kable("html", col.names = c("Variable", "Category", "No Infestation", "Yes Infestation"), align = "c") %>%
kable_styling(full_width = FALSE, position = "left",
bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, color = "black") %>%
column_spec(2, color = "black") %>%
column_spec(3, color = "black") %>%
column_spec(4, color = "black") %>%
row_spec(0, bold = TRUE, color = "black")
Variable | Category | No Infestation | Yes Infestation |
---|---|---|---|
wall_material | 1 | 21.1% | 47.1% |
wall_material | 2 | 61.2% | 47.1% |
wall_material | 3 | 11.6% | 5.9% |
wall_material | 4 | 6.1% | 0.0% |
wall_covering | 0 | 8.9% | 30.8% |
wall_covering | 1 | 91.1% | 69.2% |
roof_material | 1 | 25.0% | 17.6% |
roof_material | 2 | 55.6% | 76.5% |
roof_material | 3 | 8.3% | 5.9% |
roof_material | 4 | 11.1% | 0.0% |
roof_covering | 0 | 50.0% | 80.0% |
roof_covering | 1 | 50.0% | 20.0% |
watertank | 0 | 11.7% | 16.7% |
watertank | 1 | 88.3% | 83.3% |
watertank_pigeon | 0 | 78.1% | 100.0% |
watertank_pigeon | 1 | 21.9% | 0.0% |
palm | 0 | 91.4% | 100.0% |
palm | 1 | 8.6% | 0.0% |
objects | 0 | 42.3% | 12.5% |
objects | 1 | 57.7% | 87.5% |
insecticide | 0 | 20.4% | 41.2% |
insecticide | 1 | 79.6% | 58.8% |
commercial | 0 | 53.1% | 50.0% |
commercial | 1 | 46.9% | 50.0% |
permethrin | 0 | 70.8% | 100.0% |
permethrin | 1 | 29.2% | 0.0% |
other | 0 | 64.6% | 50.0% |
other | 1 | 35.4% | 50.0% |
travel_rural | 0 | 53.1% | 66.7% |
travel_rural | 1 | 46.9% | 33.3% |
dwelling_status | 1 | 68.8% | 93.3% |
dwelling_status | 2 | 22.1% | 0.0% |
dwelling_status | 3 | 9.1% | 6.7% |
chicken_coop | 0 | 98.1% | 72.2% |
chicken_coop | 1 | 1.9% | 27.8% |
rats | 0 | 45.2% | 38.9% |
rats | 1 | 54.8% | 61.1% |
We perform Fisher’s exact test for each categorical variable to check
for significant associations with the INF
variable.
# Initialize an empty data frame to store results
fisher_results <- data.frame(Variable = character(),
p_value = numeric(),
stringsAsFactors = FALSE)
# Perform Fisher's exact test for each categorical variable
for (var in categorical_vars) {
test_result <- fisher.test(table(dbEV[[var]], dbEV$INF))
fisher_results <- rbind(fisher_results, data.frame(Variable = var, p_value = test_result$p.value))
}
# Display the results
fisher_results <- fisher_results %>%
mutate(Significance = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
TRUE ~ ""
))
# Format and display the results as a table
fisher_results %>%
mutate(p_value = formatC(p_value, format = "f", digits = 4)) %>%
kable("html", col.names = c("Variable", "P-value", "Significance"), align = "c") %>%
kable_styling(full_width = FALSE, position = "left",
bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, color = "black") %>%
column_spec(2, color = "black") %>%
column_spec(3, bold = TRUE, color = "red") %>%
row_spec(which(fisher_results$Significance != ""), bold = TRUE) %>%
row_spec(0, bold = TRUE, color = "black")
Variable | P-value | Significance |
---|---|---|
wall_material | 0.1474 | |
wall_covering | 0.0372 | * |
roof_material | 0.3866 | |
roof_covering | 0.0498 | * |
watertank | 0.4649 | |
watertank_pigeon | 0.0256 | * |
palm | 0.3651 | |
objects | 0.0288 | * |
insecticide | 0.0660 | |
commercial | 1.0000 | |
permethrin | 1.0000 | |
other | 1.0000 | |
travel_rural | 0.3248 | |
dwelling_status | 0.0893 | |
chicken_coop | 0.0003 | *** |
rats | 0.8029 | |
Next, we visualize the numeric explanatory variables against the infestation status of the household (INF).
# Function to create violin plots with jittered points
create_violin_plot <- function(data, numeric_var, target_var) {
# Remove rows with NA in the current numeric variable
data <- data[!is.na(data[[numeric_var]]), ]
ggplot(data, aes_string(x = target_var, y = numeric_var, fill = target_var)) +
geom_violin(trim = FALSE) +
geom_jitter(width = 0.2, size = 0.6, alpha = 0.6) +
labs(y = numeric_var, fill = target_var, x = target_var) +
theme_minimal() +
theme(
legend.position = "none",
plot.title = element_text(size = 8),
axis.title.x = element_text(size = 8),
axis.title.y = element_text(size = 8),
axis.text.x = element_text(size = 7),
axis.text.y = element_text(size = 7)
) +
ggtitle(numeric_var)
}
# Create list of numeric variables to visualize
numeric_var <- c("dog_num",
"cat_num",
"chicken_num",
"other_num",
"dist_lum",
"num_lum_50m",
"num_lum_100m",
"dist.infest",
"dist_NW",
"dist_S",
"num_infest_50m",
"num_infest_100m",
"num_S",
"num_S_infest_50m",
"num_S_infest_100m",
"num_NW",
"num_NW_infest_50m",
"num_NW_infest_100m",
"percent_green_50",
"percent_green_100")
# Create a list to store the plots
num_plots <- list()
# Create plots for each numeric variable and store in the list
for (var in numeric_var) {
num_plots[[var]] <- create_violin_plot(dbEV, var, "INF")
}
# Arrange the plots in a 4 x 5 grid (4 rows, 5 columns)
grid.arrange(grobs = num_plots, ncol = 5, nrow = 4)
We perform the non-parametric Wilcoxon rank-sum (Mann-Whitney U) test for each numeric variable to
check for significant associations with the INF
variable.
# Initialize an empty data frame to store results
results <- data.frame(Variable = character(),
p_value = numeric(),
stringsAsFactors = FALSE)
# Perform Mann-Whitney test for each numeric variable
for (var in numeric_var) {
test_result <- wilcox.test(dbEV[[var]] ~ dbEV$INF, data = dbEV)
results <- rbind(results, data.frame(Variable = var, p_value = test_result$p.value))
}
# Display the results
results <- results %>%
mutate(Significance = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
TRUE ~ ""
))
# Format p-values to four decimal places
results$p_value <- formatC(results$p_value, format = "f", digits = 4)
# Format the table for the HTML report
results %>%
kable("html", col.names = c("Variable", "P-value", "Significance"), align = "c") %>%
kable_styling(full_width = FALSE, position = "left", bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, color = "black") %>%
column_spec(2, color = "black") %>%
column_spec(3, bold = TRUE, color = "red") %>%
row_spec(which(results$Significance != ""), bold = TRUE) %>%
row_spec(0, bold = TRUE, color = "black")
Variable | P-value | Significance |
---|---|---|
dog_num | 0.2355 | |
cat_num | 0.4886 | |
chicken_num | 0.0000 | *** |
other_num | 0.0952 | |
dist_lum | 0.0410 | * |
num_lum_50m | 0.7524 | |
num_lum_100m | 0.0203 | * |
dist.infest | 0.0460 | * |
dist_NW | 0.5073 | |
dist_S | 0.7514 | |
num_infest_50m | 0.0000 | *** |
num_infest_100m | 0.0000 | *** |
num_S | 0.8242 | |
num_S_infest_50m | 0.0000 | *** |
num_S_infest_100m | 0.0000 | *** |
num_NW | 0.8073 | |
num_NW_infest_50m | 0.5132 | |
num_NW_infest_100m | 0.3501 | |
percent_green_50 | 0.2281 | |
percent_green_100 | 0.2740 | |
We first explore the need to create a new summary variable for the animal hosts. As a first step, we computed a Spearman correlation matrix of the numbers of animal hosts.
# Extract numeric variables related to animals
animal_vars <- c("dog_num", "cat_num", "chicken_num", "other_num")
# Compute the Spearman correlation matrix with significance
library(Hmisc)
cor_res <- rcorr(as.matrix(dbEV[animal_vars]), type = "spearman")
# Extract the correlation coefficients and p-values
cor_matrix <- cor_res$r
p_matrix <- cor_res$P
# Set the diagonal elements of both matrices to NA
diag(cor_matrix) <- NA
diag(p_matrix) <- NA
# Format the matrices with three decimal places and no scientific notation
formatted_cor_matrix <- formatC(cor_matrix, format = "f", digits = 3)
formatted_p_matrix <- formatC(p_matrix, format = "f", digits = 3)
# Convert matrices to data frames for better kable formatting
cor_df <- as.data.frame(formatted_cor_matrix)
p_df <- as.data.frame(formatted_p_matrix)
# Create row and column names for clarity
rownames(cor_df) <- colnames(cor_df) <- animal_vars
rownames(p_df) <- colnames(p_df) <- animal_vars
# Print the correlation matrix using kable
kable(cor_df, caption = "Spearman Correlation Matrix", align = "c") %>%
kable_styling(full_width = FALSE, position = "left",
bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1:ncol(cor_df), color = "black") %>%
row_spec(0, color = "black", bold = TRUE)
 | dog_num | cat_num | chicken_num | other_num |
---|---|---|---|---|
dog_num | NA | 0.363 | 0.193 | 0.180 |
cat_num | 0.363 | NA | 0.008 | 0.015 |
chicken_num | 0.193 | 0.008 | NA | 0.227 |
other_num | 0.180 | 0.015 | 0.227 | NA |
# Print the p-value matrix using kable
kable(p_df, caption = "P-Value Matrix for Spearman Correlations", align = "c") %>%
kable_styling(full_width = FALSE, position = "left",
bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1:ncol(cor_df), color = "black") %>%
row_spec(0, color = "black", bold = TRUE)
 | dog_num | cat_num | chicken_num | other_num |
---|---|---|---|---|
dog_num | NA | 0.000 | 0.012 | 0.021 |
cat_num | 0.000 | NA | 0.915 | 0.847 |
chicken_num | 0.012 | 0.915 | NA | 0.003 |
other_num | 0.021 | 0.847 | 0.003 | NA |
# Visualize the correlation matrix without the diagonal elements
corrplot(cor_matrix, method = "circle", na.label = " ")
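As an optional refinement (a sketch reusing cor_res from rcorr() above), the p-value matrix can be overlaid on the correlation plot so that non-significant pairs are blanked out:
# Blank out pairs whose Spearman p-value exceeds 0.05
corrplot(cor_res$r, method = "circle", type = "upper", diag = FALSE,
         p.mat = cor_res$P, sig.level = 0.05, insig = "blank")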
Next, we performed a factor analysis of mixed data (FAMD) to include both numeric and categorical variables (presence of rats). We did not include the presence of pigeons in water tanks, since none of the infested houses had them.
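That exclusion can be verified directly; a quick check, assuming dbEV as defined above:
# Confirm that no infested house had pigeons in the water tank
table(watertank_pigeon = dbEV$watertank_pigeon, INF = dbEV$INF, useNA = "ifany")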
# Extract the relevant variables, including categorical variables
famd_vars <- c("dog_num", "cat_num", "chicken_num", "other_num", "rats")
# Ensure the categorical variables are factors
dbEV[famd_vars] <- lapply(dbEV[famd_vars], function(x) if(is.character(x)) as.factor(x) else x)
# Impute missing values using imputeFAMD
imputed_data <- imputeFAMD(dbEV[famd_vars], ncp = 4) #ncp is the number of dimensions to use for imputation
# Perform FAMD on the imputed dataset
famd_res <- FAMD(imputed_data$completeObs, ncp = 4, graph = FALSE)
# Print the summary of FAMD results
summary(famd_res)
##
## Call:
## FAMD(base = imputed_data$completeObs, ncp = 4, graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4
## Variance 1.534 1.052 0.954 0.785
## % of var. 30.685 21.032 19.078 15.690
## Cumulative % of var. 30.685 51.718 70.795 86.485
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 3 | 1.201 | 0.397 0.051 0.109 | 0.043 0.001 0.001 | -0.072
## 6 | 1.589 | 0.944 0.289 0.353 | 0.030 0.000 0.000 | -0.139
## 7 | 1.498 | -1.469 0.699 0.961 | 0.087 0.004 0.003 | 0.101
## 8 | 1.450 | -1.395 0.631 0.925 | -0.006 0.000 0.000 | 0.089
## 9 | 1.498 | -1.469 0.699 0.961 | 0.087 0.004 0.003 | 0.101
## 10 | 1.450 | -1.395 0.631 0.925 | -0.006 0.000 0.000 | 0.089
## 11 | 1.498 | -1.469 0.699 0.961 | 0.087 0.004 0.003 | 0.101
## 24 | 1.474 | -0.114 0.004 0.006 | -0.192 0.018 0.017 | -0.019
## 25 | 1.487 | -1.409 0.644 0.898 | 0.189 0.017 0.016 | 0.288
## 26 | 1.201 | 0.397 0.051 0.109 | 0.043 0.001 0.001 | -0.072
## ctr cos2
## 3 0.003 0.004 |
## 6 0.010 0.008 |
## 7 0.005 0.005 |
## 8 0.004 0.004 |
## 9 0.005 0.005 |
## 10 0.004 0.004 |
## 11 0.005 0.005 |
## 24 0.000 0.000 |
## 25 0.043 0.037 |
## 26 0.003 0.004 |
##
## Continuous variables
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr
## dog_num | 0.757 37.339 0.573 | -0.015 0.022 0.000 | -0.073 0.552
## cat_num | 0.537 18.775 0.288 | -0.562 29.983 0.315 | -0.069 0.498
## chicken_num | 0.242 3.814 0.059 | 0.727 50.205 0.528 | -0.555 32.316
## other_num | 0.322 6.764 0.104 | 0.456 19.773 0.208 | 0.796 66.446
## cos2
## dog_num 0.005 |
## cat_num 0.005 |
## chicken_num 0.308 |
## other_num 0.634 |
##
## Categories
## Dim.1 ctr cos2 v.test Dim.2 ctr cos2 v.test
## 0 | -0.847 15.909 0.685 -10.110 | 0.014 0.009 0.000 0.195
## 1 | 0.926 17.400 0.685 10.110 | -0.015 0.009 0.000 -0.195
## Dim.3 ctr cos2 v.test
## 0 | 0.040 0.090 0.001 0.598 |
## 1 | -0.043 0.098 0.001 -0.598 |
Visualize the FAMD results
# Visualize the FAMD results in the first 2 dimensions
fviz_famd_var(famd_res, repel = TRUE) +
labs(title = "Variables factor map")
# Visualize the contributions of quantitative variables
fviz_famd_var(famd_res, choice = "quanti.var", repel = TRUE) +
labs(title = "Quantitative Variables factor map")
# Modify INF labels to be more descriptive
dbEV$INF_label <- factor(dbEV$INF, levels = c(0, 1), labels = c("Not Infested", "Infested"))
# Visualize the FAMD results with individuals grouped by infestation status
fviz_famd(famd_res,
habillage = dbEV$INF_label, # Use descriptive labels for coloring
palette = c("blue", "red"), # Custom colors: "Not Infested" (blue), "Infested" (red)
repel = TRUE, # Avoid overlapping text labels
addEllipses = TRUE, # Add confidence ellipses
label = "none") + # Disable individual labels
labs(title = "FAMD - Individuals grouped by infestation",
color = "Infestation Status", fill = "Infestation Status") + # Set a single legend title
theme(legend.position = "right")
Explore the use of PCA for the animal hosts using the numeric variables only
# Extract the quantitative variables for PCA
quant_vars <- c("dog_num", "cat_num", "chicken_num", "other_num")
quant_data <- dbEV[quant_vars]
# Ensure the data does not contain NA values
quant_data <- na.omit(quant_data)
# Perform PCA
pca_res <- prcomp(quant_data, scale. = TRUE)
# Print summary of PCA results
summary(pca_res)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.1253 1.0276 0.9770 0.8505
## Proportion of Variance 0.3166 0.2640 0.2386 0.1808
## Cumulative Proportion 0.3166 0.5806 0.8192 1.0000
# Extract the quantitative variables for PCA
quant_vars <- c("dog_num", "cat_num", "chicken_num", "other_num")
quant_data <- dbEV[quant_vars]
# Impute missing values using missMDA's PCA-based method
# Estimate the number of dimensions to keep for imputation
ncp <- estim_ncpPCA(quant_data, scale = TRUE)$ncp
# Perform the imputation
imputed_data <- imputePCA(quant_data, ncp = ncp, scale = TRUE)$completeObs
# Perform PCA on the imputed data
pca_res <- prcomp(imputed_data, scale. = TRUE)
# Print summary of PCA results
summary(pca_res)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.1238 1.0262 0.9770 0.8541
## Proportion of Variance 0.3157 0.2633 0.2386 0.1824
## Cumulative Proportion 0.3157 0.5790 0.8176 1.0000
# Extract PCA coordinates (Principal Components)
pca_coords <- as.data.frame(pca_res$x)
# Rename the columns for clarity (e.g., PCA_Dim1, PCA_Dim2, etc.)
colnames(pca_coords) <- paste0("PCA_animal_Dim", 1:ncol(pca_coords))
# Add PCA dimensions back to the original dataset
dbEV <- cbind(dbEV, pca_coords)
Visualize the PCA results and evaluate association with infestation
# Plot variables
fviz_pca_var(pca_res, repel = TRUE) +
labs(title = "PCA - Variables")
# Plot individuals with custom colors and descriptive labels in the legend
fviz_pca_ind(pca_res,
geom.ind = "point", # Plot individuals as points
habillage = dbEV$INF_label, # Use descriptive labels for coloring
addEllipses = TRUE, # Add confidence ellipses
palette = c("blue", "red"), # Custom colors: "Not Infested" (blue), "Infested" (red)
legend.title = "Infestation Status", # Set a single legend title
label = "none") + # Disable individual labels
scale_color_manual(values = c("Not Infested" = "blue", "Infested" = "red")) + # Ensure consistent color mapping
theme(legend.position = "right") # Position legend on the right
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
# Define all variables for Mann-Whitney U tests
all_vars <- c(colnames(pca_coords))
# Remove variables containing "NA"
all_vars <- subset(all_vars, !is.na(all_vars))
# Initialize an empty data frame to store results
results <- data.frame(Variable = character(),
p_value = numeric(),
Significance = character(),
stringsAsFactors = FALSE)
# Perform Mann-Whitney test for each variable
for (var in all_vars) {
test_result <- wilcox.test(dbEV[[var]] ~ dbEV$INF, data = dbEV)
results <- rbind(results, data.frame(Variable = var,
p_value = formatC(test_result$p.value, format = "f", digits = 4),
Significance = ifelse(test_result$p.value < 0.05, ifelse(test_result$p.value < 0.01, ifelse(test_result$p.value < 0.001, "***", "**"), "*"), "")))
}
# p-values were already formatted to four decimal places inside the loop above,
# so no further formatting is needed here
# Format the table for the HTML report
results %>%
kable("html", col.names = c("Variable", "P-value", "Significance"), align = "c") %>%
kable_styling(full_width = FALSE, position = "left", bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, color = "black") %>%
column_spec(2, color = "black") %>%
column_spec(3, bold = TRUE, color = "red") %>%
row_spec(which(results$Significance != ""), bold = TRUE) %>%
row_spec(0, bold = TRUE, color = "black")
Variable | P-value | Significance |
---|---|---|
PCA_animal_Dim1 | 0.0233 | * |
PCA_animal_Dim2 | 0.3070 | |
PCA_animal_Dim3 | 0.0159 | * |
PCA_animal_Dim4 | 0.0591 | |
The PCA performs slightly better than the FAMD (it explains more variance), so we proceed with it. The first two PCA dimensions explain 31.6% and 26.3% of the variance, respectively. The first dimension separates households with a larger number of animal hosts, and the second dimension distinguishes, in particular, households with a larger number of chickens and other animals rather than just companion animals (dogs and cats). The third dimension is associated with the number of chickens. Given that there are only 4 dimensions in total (one per variable), the first two dimensions explain ~58% of the variance, and the numbers of dogs and cats are significantly correlated. The animal host variables selected for the infestation models were therefore the number of dogs and the number of chickens; however, we also try PCA dimension 1 in the model as an alternative summary of the total number of animal hosts.
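The interpretation of the dimensions above can be checked against the loadings directly; a minimal sketch, assuming pca_res from the PCA on the imputed data above:
# Variable loadings on each principal component (signs are arbitrary)
round(pca_res$rotation, 3)
# Variance explained per component, for reference
round(summary(pca_res)$importance, 3)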
We used cluster analysis to build a "house type" variable based on the construction materials and characteristics of the walls and roof. We first performed a multiple correspondence analysis (MCA) to summarize the categorical variables into orthogonal numeric variables to be used in the cluster analysis.
# Define the variables related to house type
house_vars <- c("wall_material", "wall_covering", "roof_material", "roof_covering")
# Ensure the variables are treated as factors
dbEV[house_vars] <- lapply(dbEV[house_vars], as.factor)
# Impute missing values
imputed_data <- imputeMCA(dbEV[house_vars], ncp = 3)
# Perform Multiple Correspondence Analysis (MCA) on imputed data
mca_res <- MCA(imputed_data$completeObs, graph = TRUE)
# Extract MCA coordinates for individuals
mca_coords <- as.data.frame(mca_res$ind$coord)
colnames(mca_coords) <- paste0("MCA_Dim", 1:ncol(mca_coords))
# Combine the original data frame with the MCA results
dbEV <- cbind(dbEV, mca_coords)
# Create a scree plot to visualize the variance explained by each dimension
fviz_screeplot(mca_res, addlabels = TRUE, ylim = c(0, 100)) +
labs(title = "Scree Plot - Variance Explained by Each MCA Dimension",
x = "Dimensions", y = "Percentage of Variance Explained")
We then performed a hierarchical clustering analysis on the first 5 MCA dimensions (~77% of the variance).
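The cumulative variance quoted above can be read off the MCA eigenvalues; a quick check, assuming mca_res from the house-type MCA above:
# Eigenvalues and cumulative percentage of variance for the house-type MCA
round(head(mca_res$eig), 3)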
# Perform hierarchical clustering
diss_matrix <- daisy(mca_coords, metric = "euclidean")
hc_res <- hclust(diss_matrix, method = "ward.D2")
# Plot the dendrogram
plot(hc_res, labels = FALSE, main = "Hierarchical Clustering Dendrogram")
# Cut the dendrogram to obtain 3 clusters
clusters <- cutree(hc_res, k = 3)
# Create a table summarizing the clusters
cluster_table <- data.frame(Cluster = clusters, dbEV[house_vars])
print("Cluster summary table:")
## [1] "Cluster summary table:"
print(table(cluster_table$Cluster))
##
## 1 2 3
## 128 40 33
# Add the cluster assignments as a new categorical variable to dbEV
dbEV$house_type <- as.factor(clusters)
Next, we visualized the clusters (house types) based on the MCA variables, and characterized the clusters based on the construction characteristics of the houses.
# Plot the individuals colored by house_type
fviz_mca_ind(mca_res,
label = "none", # Hide individual labels
habillage = dbEV$house_type, # Color by house_type
addEllipses = TRUE, # Add confidence ellipses
ellipse.level = 0.95, # Confidence level
palette = "jco") # Color palette
# Plot the variables
fviz_mca_var(mca_res,
repel = TRUE, # Avoid text overlapping
col.var = "black") # Color of variables
# Plot a biplot showing both individuals and variables colored by house_type
fviz_mca_biplot(mca_res,
repel = TRUE, # Avoid text overlapping
col.var = "black", # Color of variables
habillage = dbEV$house_type, # Color by house_type
palette = "jco") # Color palette
## Warning: ggrepel: 162 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Create stacked bar plots for each house variable without NAs
plot_list <- lapply(house_vars, function(var) {
ggplot(na.omit(dbEV[, c(var, "house_type")]), aes_string(x = var, fill = "house_type")) +
geom_bar(position = "fill") +
labs(y = "Proportion", fill = "House Type") +
theme_minimal() +
ggtitle(paste("Distribution of", var))
})
# Arrange the plots in a grid
do.call(grid.arrange, c(plot_list, ncol = 2))
# Create stacked bar plots for each house variable with house_type on the x-axis and without NAs
plot_list <- lapply(house_vars, function(var) {
ggplot(na.omit(dbEV[, c(var, "house_type")]), aes_string(x = "house_type", fill = var)) +
geom_bar(position = "fill") +
labs(y = "Proportion", fill = var) +
theme_minimal() +
ggtitle(paste("Distribution of", var, "by House Type"))
})
# Arrange the plots in a grid
do.call(grid.arrange, c(plot_list, ncol = 2))
To do: describe the construction characteristics of the houses here and characterize the clusters.
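As a starting point for that characterization, the construction variables can be cross-tabulated against the clusters; a minimal sketch, assuming cluster_table and house_vars from the chunks above:
# Proportion of each construction category within each house-type cluster
for (var in house_vars) {
  cat("\n", var, "\n")
  print(round(prop.table(table(Cluster = cluster_table$Cluster,
                               cluster_table[[var]]), margin = 1), 2))
}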
# Define the additional variables for MCA
additional_vars <- c("education_level", "pension", "benefit", "health")
# Recode health categories 3 and 4 as 2, and specify .default to keep the other values
dbEV$health <- recode(as.numeric(dbEV$health), `4` = 2, `3` = 2, .default = as.numeric(dbEV$health))
# Recode education_level categories 4 and 5 as 3, and specify .default to handle other values
dbEV$education_level <- recode(as.numeric(dbEV$education_level), `4` = 3, `5` = 3, .default = as.numeric(dbEV$education_level))
# Create the income_work variable
dbEV$income_work <- with(dbEV, ifelse(employer == 1 | employee == 1 | selfemployed == 1, "income_work",
ifelse(housework == 1 | not.employed == 1, "not_income_work", NA)))
# Ensure the variables are treated as factors
dbEV[additional_vars] <- lapply(dbEV[additional_vars], as.factor)
dbEV$income_work <- as.factor(dbEV$income_work)
# Create frequency tables for each variable
health_table <- as.data.frame(table(dbEV$health))
education_table <- as.data.frame(table(dbEV$education_level))
benefit_table <- as.data.frame(table(dbEV$benefit))
pension_table <- as.data.frame(table(dbEV$pension))
income_work_table <- as.data.frame(table(dbEV$income_work))
# Rename columns for clarity
colnames(health_table) <- c("Category", "Health_Freq")
colnames(education_table) <- c("Category", "Education_Freq")
colnames(benefit_table) <- c("Category", "Benefit_Freq")
colnames(pension_table) <- c("Category", "Pension_Freq")
colnames(income_work_table) <- c("Category", "Income_Work_Freq")
# Merge the tables by "Category" with all data included (full join)
combined_table <- Reduce(function(x, y) merge(x, y, by = "Category", all = TRUE),
list(health_table, education_table, benefit_table, pension_table, income_work_table))
# Display the combined table using kable
kable(combined_table, caption = "Frequency Tables for Health, Education Level, Benefit, Pension, and Income Work", align = "c") %>%
kable_styling(full_width = FALSE, position = "left",
bootstrap_options = c("striped", "hover", "condensed")) %>%
row_spec(0, bold = TRUE, color = "black") %>%
column_spec(1:ncol(combined_table), color = "black")
Category | Health_Freq | Education_Freq | Benefit_Freq | Pension_Freq | Income_Work_Freq |
---|---|---|---|---|---|
1 | 45 | 37 | 27 | 82 | NA |
2 | 112 | 78 | NA | NA | NA |
3 | NA | 51 | NA | NA | NA |
0 | NA | NA | 108 | 76 | NA |
income_work | NA | NA | NA | NA | 124 |
not_income_work | NA | NA | NA | NA | 35 |
Next, we performed an MCA on the categorical variables to summarize the socio-demographic variables.
# Combine income_work with additional variables for MCA
mca_vars <- c(additional_vars, "income_work")
# Check for missing values in the combined data
missing_values <- sapply(dbEV[mca_vars], function(x) sum(is.na(x)))
print("Missing values in each variable:")
## [1] "Missing values in each variable:"
print(missing_values)
## education_level pension benefit health income_work
## 35 43 66 44 42
# Impute missing values
imputed_data <- imputeMCA(dbEV[mca_vars], ncp = 3)
# Perform Multiple Correspondence Analysis (MCA) on imputed data
mca_res <- MCA(imputed_data$completeObs, graph = TRUE)
# Extract MCA coordinates for individuals
mca_coords <- as.data.frame(mca_res$ind$coord)
colnames(mca_coords) <- paste0("MCA_SEE_Dim", 1:ncol(mca_coords))
# Combine the original data frame with the MCA results
dbEV <- cbind(dbEV, mca_coords)
We then visualized the results of the MCA in more detail.
# Visualize the contribution of variables to the first dimension
fviz_contrib(mca_res, choice = "var", axes = 1, top = 10)
# Visualize the contribution of variables to the second dimension
fviz_contrib(mca_res, choice = "var", axes = 2, top = 10)
# Visualize the biplot of the MCA results
fviz_mca_biplot(mca_res,
repel = TRUE, # Avoid text overlapping
col.var = "black", # Color of variables
habillage = "none", # Color by income_work
palette = "jco", # Color palette
addEllipses = FALSE, # Add confidence ellipses
ellipse.level = 0.95) # Confidence level
## Warning: ggrepel: 146 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Get the variance explained by each dimension
eigenvalues <- mca_res$eig
print("Eigenvalues and variance explained by each dimension:")
## [1] "Eigenvalues and variance explained by each dimension:"
print(eigenvalues)
## eigenvalue percentage of variance cumulative percentage of variance
## dim 1 0.2983585 24.86321 24.86321
## dim 2 0.2288132 19.06777 43.93097
## dim 3 0.1981309 16.51090 60.44188
## dim 4 0.1924189 16.03491 76.47679
## dim 5 0.1566999 13.05833 89.53512
## dim 6 0.1255786 10.46488 100.00000
We then performed a hierarchical clustering of the households based on the first three MCA dimensions (~60% of the variance).
# Extract the first three dimensions of the MCA coordinates for individuals
mca_coords <- as.data.frame(mca_res$ind$coord[, 1:3])
colnames(mca_coords) <- paste0("MCA_SEE_Dim", 1:3)
# Perform hierarchical clustering on the first three dimensions
diss_matrix <- dist(mca_coords) # Compute dissimilarity matrix
hc_res <- hclust(diss_matrix, method = "ward.D2")
# Plot the dendrogram
plot(hc_res, labels = FALSE, main = "Hierarchical Clustering Dendrogram")
# Cut the dendrogram to obtain clusters (e.g., 3 clusters)
num_clusters <- 3
clusters <- cutree(hc_res, k = num_clusters)
# Add the cluster results back to the original data frame
dbEV$SEE <- as.factor(clusters)
table(dbEV$SEE)
##
## 1 2 3
## 107 60 34
Finally, we visualized the clusters based on the MCA results for interpretation
# Visualize the biplot of the MCA results with clusters
fviz_mca_biplot(mca_res,
repel = TRUE, # Avoid text overlapping
col.var = "black", # Color of variables
habillage = dbEV$SEE, # Color by clusters
palette = "jco", # Color palette
addEllipses = TRUE, # Add confidence ellipses
ellipse.level = 0.95) # Confidence level
## Warning: ggrepel: 159 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Create stacked bar plots for each socio-demographic variable without NAs
plot_list <- lapply(mca_vars, function(var) {
ggplot(na.omit(dbEV[, c(var, "SEE")]), aes_string(x = var, fill = "SEE")) +
geom_bar(position = "fill") +
labs(y = "Proportion", fill = "SEE") +
theme_minimal() +
ggtitle(paste("Distribution of", var))
})
# Arrange the plots in a grid
do.call(grid.arrange, c(plot_list, ncol = 2))
# Create stacked bar plots for each socio-demographic variable with SEE on the x-axis and without NAs
plot_list <- lapply(mca_vars, function(var) {
ggplot(na.omit(dbEV[, c(var, "SEE")]), aes_string(x = "SEE", fill = var)) +
geom_bar(position = "fill") +
labs(y = "Proportion", fill = var) +
theme_minimal() +
ggtitle(paste("Distribution of", var, "by SEE"))
})
# Arrange the plots in a grid
do.call(grid.arrange, c(plot_list, ncol = 2))
To do: write the interpretation of the SEE clusters, and rename SEE to SES (socio-economic status).
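As a starting point for that interpretation, the socio-demographic variables can be summarized within each SEE cluster; a minimal sketch, assuming dbEV and mca_vars from the chunks above:
# Category proportions within each SEE cluster (a numerical complement to the plots above)
for (var in mca_vars) {
  cat("\n", var, "\n")
  print(round(prop.table(table(SEE = dbEV$SEE, dbEV[[var]]), margin = 1), 2))
}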
dbEV$dwelling_status <- as.factor(dbEV$dwelling_status)
# Create a contingency table between the cluster (SEE) and dwelling_status
contingency_table <- table(SEE = dbEV$SEE, DwellingStatus = dbEV$dwelling_status)
# Print the contingency table with variable titles and margins
print("Contingency Table:")
## [1] "Contingency Table:"
contingency_table_with_margins <- addmargins(contingency_table)
print(ftable(contingency_table_with_margins))
## DwellingStatus 1 2 3 Sum
## SEE
## 1 62 12 5 79
## 2 35 19 5 59
## 3 23 3 5 31
## Sum 120 34 15 169
# Perform the chi-square test of independence
chisq_test <- chisq.test(contingency_table)
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
# Print the results of the chi-square test
print("Chi-square Test Results:")
## [1] "Chi-square Test Results:"
print(chisq_test)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 11.142, df = 4, p-value = 0.02501
# Convert the contingency table to a data frame for ggplot2
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("SEE", "DwellingStatus", "Count")
# Create a bar plot of the proportions
ggplot(contingency_df, aes(x = SEE, y = Count, fill = DwellingStatus)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(y = "Proportion", x = "Socio-Economic Status (SEE)", fill = "Dwelling Status") +
ggtitle("Proportions of Dwelling Status within SEE Groups") +
theme_minimal()
# Create a mosaic plot of the contingency table
mosaic(~SEE + DwellingStatus, data = contingency_table,
shade = TRUE, legend = TRUE,
main = "Mosaic Plot of Dwelling Status by SEE")
dbEV$house_type <- as.factor(dbEV$house_type)
# Create a contingency table between the cluster (SEE) and house_type
contingency_table <- table(SEE = dbEV$SEE, HouseType = dbEV$house_type)
# Print the contingency table with variable titles and margins
print("Contingency Table:")
## [1] "Contingency Table:"
contingency_table_with_margins <- addmargins(contingency_table)
print(ftable(contingency_table_with_margins))
## HouseType 1 2 3 Sum
## SEE
## 1 74 14 19 107
## 2 30 20 10 60
## 3 24 6 4 34
## Sum 128 40 33 201
# Perform the chi-square test of independence
chisq_test <- chisq.test(contingency_table)
# Print the results of the chi-square test
print("Chi-square Test Results:")
## [1] "Chi-square Test Results:"
print(chisq_test)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 11.115, df = 4, p-value = 0.0253
# Convert the contingency table to a data frame for ggplot2
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("SEE", "HouseType", "Count")
# Create a bar plot of the proportions
ggplot(contingency_df, aes(x = SEE, y = Count, fill = HouseType)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(y = "Proportion", x = "Socio-Economic Status (SEE)", fill = "HouseType") +
ggtitle("Proportions of House Type within SEE Groups") +
theme_minimal()
# Create a mosaic plot of the contingency table
mosaic(~SEE + HouseType, data = contingency_table,
shade = TRUE, legend = TRUE,
main = "Mosaic Plot of House Type by SEE")
dbEV$insecticide <- as.factor(dbEV$insecticide)
# Create a contingency table between the cluster (SEE) and insecticide use
contingency_table <- table(SEE = dbEV$SEE, Insecticide = dbEV$insecticide)
# Print the contingency table with variable titles and margins
print("Contingency Table:")
## [1] "Contingency Table:"
contingency_table_with_margins <- addmargins(contingency_table)
print(ftable(contingency_table_with_margins))
## Insecticide 0 1 Sum
## SEE
## 1 20 61 81
## 2 11 48 59
## 3 8 26 34
## Sum 39 135 174
# Perform the chi-square test of independence
chisq_test <- chisq.test(contingency_table)
# Print the results of the chi-square test
print("Chi-square Test Results:")
## [1] "Chi-square Test Results:"
print(chisq_test)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 0.74809, df = 2, p-value = 0.6879
# Convert the contingency table to a data frame for ggplot2
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("SEE", "Insecticide", "Count")
# Create a bar plot of the proportions
ggplot(contingency_df, aes(x = SEE, y = Count, fill = Insecticide)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
labs(y = "Proportion", x = "Socio-Economic Status (SEE)", fill = "Insecticide") +
ggtitle("Proportions of House Type within SEE Groups") +
theme_minimal()
# Create a mosaic plot of the contingency table
mosaic(~SEE + Insecticide, data = contingency_table,
shade = TRUE, legend = TRUE,
main = "Mosaic Plot of Insecticide use by SEE")
Lastly, we fitted three models to evaluate household infestation status:
# Define the additional categorical variables
categorical_var <- c("SEE", "house_type")
# Initialize an empty data frame to store results
results_chisq <- data.frame(Variable = character(),
p_value = numeric(),
stringsAsFactors = FALSE)
# Perform Chi-square test for each categorical variable
for (var in categorical_var) {
# Remove rows with NA values in the specific variables
valid_data <- dbEV %>% select(all_of(var), INF) %>% na.omit()
# Create contingency table
contingency_table <- table(valid_data[[var]], valid_data$INF)
# Perform chi-square test
test_result <- chisq.test(contingency_table)
# Store results
results_chisq <- rbind(results_chisq, data.frame(Variable = var, p_value = test_result$p.value))
}
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
# Display the results
results_chisq <- results_chisq %>%
mutate(Significance = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
TRUE ~ ""
))
# Format p-values to four decimal places
results_chisq$p_value <- formatC(results_chisq$p_value, format = "f", digits = 4)
# Format the table for the HTML report
formatted_table_chisq <- results_chisq %>%
kable("html", col.names = c("Variable", "P-value", "Significance"), align = "c") %>%
kable_styling(full_width = FALSE, position = "left", bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, bold = TRUE, color = "black") %>%
column_spec(2, color = "black") %>%
column_spec(3, bold = TRUE, color = "black") %>%
row_spec(0, color = "black", bold = TRUE)
# Print the formatted table
formatted_table_chisq
Variable | P-value | Significance |
---|---|---|
SEE | 0.8551 | |
house_type | 0.5294 | |
The SES and house type variables were not associated with the infestation status of the house. Next, we ran the models using Firth's penalized logistic regression (logistf), which is better suited than standard logistic regression to the small number of infested houses.
#select variables
dbEV_mod1 <- select(dbEV, INF, SEE, house_type, chicken_coop, objects, chicken_num, dog_num, wall_covering, roof_covering, PCA_animal_Dim1)
# Model 1a: with the number of dogs and the presence of a chicken coop (instead of the number of chickens)
dbEV_mod1$house_type <- relevel(factor(dbEV_mod1$house_type), ref = "2")
fit1<-logistf(data = dbEV_mod1, INF ~ house_type + chicken_coop + objects + dog_num, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.omit)
summary(fit1)
## logistf(formula = INF ~ house_type + chicken_coop + objects +
## dog_num, data = dbEV_mod1, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.omit)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -3.24352962 0.7881712 -5.1730364 -1.8903445 34.09248672
## house_type1 0.24816084 0.7783504 -1.2421891 2.0457512 0.09688231
## house_type3 -0.27110625 0.9818707 -2.3557155 1.8109296 0.07073390
## chicken_coop1 2.30663445 0.8016294 0.7320637 3.9780758 7.96858870
## objects1 0.96062723 0.7314035 -0.4388935 2.7012503 1.74003014
## dog_num 0.02967268 0.1725631 -0.3610981 0.3667029 0.02596800
## p method
## (Intercept) 0.000000005255379 2
## house_type1 0.755603554782079 2
## house_type3 0.790271207719516 2
## chicken_coop1 0.004759603338522 2
## objects1 0.187135007376231 2
## dog_num 0.871978488250774 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=13.99973 on 5 df, p=0.01561111, n=158
## Wald test = 64.29003 on 5 df, p = 0.000000000001573075
# Model 1b: with PCA dimension 1 as an index of the total number of animal hosts, instead of the number of dogs
fit1b<-logistf(data = dbEV_mod1, INF ~ house_type + chicken_coop + objects + PCA_animal_Dim1, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.omit)
summary(fit1b)
## logistf(formula = INF ~ house_type + chicken_coop + objects +
## PCA_animal_Dim1, data = dbEV_mod1, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.omit)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -3.1734905 0.7881777 -5.1055405 -1.8227946 32.88584105
## house_type1 0.2312874 0.7610383 -1.2115518 2.0057562 0.08812514
## house_type3 -0.3098004 0.9632120 -2.3486030 1.7374288 0.09634008
## chicken_coop1 2.0054308 0.8290798 0.3629840 3.7279144 5.64760519
## objects1 0.9473739 0.7112456 -0.3975415 2.6597209 1.82655983
## PCA_animal_Dim1 0.2015658 0.2078830 -0.2777670 0.6041351 0.76388754
## p method
## (Intercept) 0.000000009773249 2
## house_type1 0.766574323659760 2
## house_type3 0.756266691160554 2
## chicken_coop1 0.017479227293958 2
## objects1 0.176534422222332 2
## PCA_animal_Dim1 0.382114659173373 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=14.9768 on 5 df, p=0.01046193, n=164
## Wald test = 65.68222 on 5 df, p = 0.0000000000008090195
# Model 1c: replaces house type with wall and roof covering, which were significant in the bivariate analyses
fit1c<-logistf(data = dbEV_mod1, INF ~ wall_covering + roof_covering + chicken_coop + objects + PCA_animal_Dim1, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.omit)
summary(fit1c)
## logistf(formula = INF ~ wall_covering + roof_covering + chicken_coop +
## objects + PCA_animal_Dim1, data = dbEV_mod1, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.omit)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -2.0414706 1.0269912 -4.6838513 -0.1764189 4.7023509
## wall_covering1 -1.7176185 0.8965726 -3.5838011 0.2544595 2.9861507
## roof_covering1 -0.3327281 0.8327502 -2.2802046 1.4376450 0.1376429
## chicken_coop1 3.8189131 1.3448303 1.3323207 7.1217844 9.5952001
## objects1 1.2457615 0.9313826 -0.5839831 3.6281952 1.7241705
## PCA_animal_Dim1 -0.5015937 0.4549452 -1.7308895 0.3717814 1.1538464
## p method
## (Intercept) 0.030121390 2
## wall_covering1 0.083979577 2
## roof_covering1 0.710635645 2
## chicken_coop1 0.001950867 2
## objects1 0.189157097 2
## PCA_animal_Dim1 0.282745410 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=16.87287 on 5 df, p=0.004747313, n=104
## Wald test = 37.29731 on 5 df, p = 0.0000005221187
# Calculate the AIC for the three models
aic_fit1 <- AIC(fit1)
aic_fit1b <- AIC(fit1b)
aic_fit1c <- AIC(fit1c)
# Print the AIC values
print(paste("AIC for Model 1a (with dog_num):", aic_fit1))
## [1] "AIC for Model 1a (with dog_num): 98.390439882464"
print(paste("AIC for Model 1b (with PCA_animal_Dim1):", aic_fit1b))
## [1] "AIC for Model 1b (with PCA_animal_Dim1): 104.463977194883"
print(paste("AIC for Model 1c (with wall/roof covering):", aic_fit1c))
## [1] "AIC for Model 1c (with wall/roof covering): 64.2754498199873"
# Note: VIF computation with logistf is not directly supported,
# so we use a standard logistic regression model to compute VIF
# Fit a standard logistic regression model for VIF calculation
fit_standard <- glm(INF ~ wall_covering + roof_covering + chicken_coop + objects + dog_num,
data = dbEV_mod1,
family = binomial,
na.action = na.omit)
# Compute VIF using the car package
vif_values <- car::vif(fit_standard)
# Display the VIF values
print(vif_values)
## wall_covering roof_covering chicken_coop objects dog_num
## 1.329212 1.213121 2.298461 1.138195 2.097412
#Multi-model inference and model averaging
dbEV_mod1 <- select(dbEV, INF, chicken_coop, objects, wall_covering, roof_covering, PCA_animal_Dim1)
dbEV_mod1 <- dbEV_mod1[complete.cases(dbEV_mod1), ]
table(dbEV$INF)
##
## 0 1
## 181 20
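# Quick check (assuming dbEV as defined above): the outcome is highly unbalanced and
# some predictor cells are sparse, which is what the penalized (Firth) likelihood in
# logistf is designed to handle
table(chicken_coop = dbEV$chicken_coop, INF = dbEV$INF, useNA = "ifany")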
fit1c<-logistf(data = dbEV_mod1, INF ~ wall_covering + roof_covering + chicken_coop + objects + PCA_animal_Dim1, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
summary(fit1c)
## logistf(formula = INF ~ wall_covering + roof_covering + chicken_coop +
## objects + PCA_animal_Dim1, data = dbEV_mod1, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.fail)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -2.0414706 1.0269912 -4.6838513 -0.1764189 4.7023509
## wall_covering1 -1.7176185 0.8965726 -3.5838011 0.2544595 2.9861507
## roof_covering1 -0.3327281 0.8327502 -2.2802046 1.4376450 0.1376429
## chicken_coop1 3.8189131 1.3448303 1.3323207 7.1217844 9.5952001
## objects1 1.2457615 0.9313826 -0.5839831 3.6281952 1.7241705
## PCA_animal_Dim1 -0.5015937 0.4549452 -1.7308895 0.3717814 1.1538464
## p method
## (Intercept) 0.030121390 2
## wall_covering1 0.083979577 2
## roof_covering1 0.710635645 2
## chicken_coop1 0.001950867 2
## objects1 0.189157097 2
## PCA_animal_Dim1 0.282745410 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=16.87287 on 5 df, p=0.004747313, n=104
## Wald test = 37.29731 on 5 df, p = 0.0000005221187
MMI <- dredge(fit1c)
## Fixed term is "(Intercept)"
MMI
## Global model call: logistf(formula = INF ~ wall_covering + roof_covering + chicken_coop +
## objects + PCA_animal_Dim1, data = dbEV_mod1, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.fail)
## ---
## Model selection table
## (Int) chc_cop obj PCA_anm_Dm1 rof_cvr wll_cvr df logLik AICc delta weight
## 5 -2.380 0.23660 2 -26.133 56.4 0.00 0.147
## 1 -2.429 1 -27.192 56.4 0.04 0.144
## 13 -1.986 0.20380 + 3 -25.778 57.8 1.41 0.072
## 9 -2.010 + 2 -26.852 57.8 1.44 0.072
## 3 -3.476 + 2 -26.853 57.8 1.44 0.071
## 7 -3.414 + 0.05883 3 -25.815 57.9 1.49 0.070
## 21 -1.107 0.29820 + 3 -26.303 58.8 2.46 0.043
## 17 -1.224 + 2 -27.380 58.9 2.49 0.042
## 11 -3.064 + + 3 -26.524 59.3 2.90 0.034
## 15 -3.001 + 0.07994 + 4 -25.468 59.3 2.95 0.034
## 2 -2.844 + 2 -27.700 59.5 3.13 0.031
## 6 -2.965 + -0.47970 3 -26.697 59.6 3.25 0.029
## 19 -2.210 + + 3 -27.020 60.3 3.89 0.021
## 29 -0.958 0.27850 + + 4 -25.948 60.3 3.92 0.021
## 25 -1.072 + + 3 -27.036 60.3 3.93 0.021
## 23 -2.103 + 0.13560 + 4 -25.966 60.3 3.95 0.020
## 10 -2.401 + + 3 -27.340 60.9 4.53 0.015
## 4 -3.476 + + 3 -27.363 61.0 4.58 0.015
## 14 -2.489 + -0.41190 + 4 -26.323 61.0 4.66 0.014
## 8 -3.805 + + -0.61440 4 -26.367 61.1 4.75 0.014
## 27 -2.015 + + + 4 -26.690 61.8 5.40 0.010
## 31 -1.894 + 0.15630 + + 5 -25.620 61.9 5.47 0.010
## 18 -1.224 + + 3 -27.869 62.0 5.59 0.009
## 22 -1.420 + -0.48390 + 4 -26.849 62.1 5.72 0.008
## 12 -2.977 + + + 4 -27.015 62.4 6.05 0.007
## 16 -3.292 + + -0.51930 + 5 -26.001 62.6 6.23 0.007
## 26 -1.070 + + + 4 -27.507 63.4 7.03 0.004
## 20 -1.869 + + + 4 -27.513 63.4 7.04 0.004
## 30 -1.258 + -0.38820 + + 5 -26.476 63.6 7.18 0.004
## 24 -2.237 + + -0.58210 + 5 -26.501 63.6 7.23 0.004
## 28 -1.660 + + + + 5 -27.165 64.9 8.56 0.002
## 32 -2.041 + + -0.50160 + + 6 -26.138 65.1 8.76 0.002
## Models ranked by AICc(x)
write.table(MMI, "clipboard", sep="\t")
MAS <- model.avg(MMI)
summary(MAS)
##
## Call:
## model.avg(object = MMI)
##
## Component model call:
## logistf(formula = INF ~ <32 unique rhs>, data = dbEV_mod1, control =
## logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
##
## Component models:
## df logLik AICc delta weight
## 3 2 -26.13 56.38 0.00 0.15
## (Null) 1 -27.19 56.42 0.04 0.14
## 34 3 -25.78 57.80 1.41 0.07
## 4 2 -26.85 57.82 1.44 0.07
## 2 2 -26.85 57.82 1.44 0.07
## 23 3 -25.82 57.87 1.49 0.07
## 35 3 -26.30 58.85 2.46 0.04
## 5 2 -27.38 58.88 2.49 0.04
## 24 3 -26.52 59.29 2.90 0.03
## 234 4 -25.47 59.34 2.95 0.03
## 1 2 -27.70 59.52 3.13 0.03
## 13 3 -26.70 59.63 3.25 0.03
## 25 3 -27.02 60.28 3.89 0.02
## 345 4 -25.95 60.30 3.92 0.02
## 45 3 -27.04 60.31 3.93 0.02
## 235 4 -25.97 60.34 3.95 0.02
## 14 3 -27.34 60.92 4.53 0.02
## 12 3 -27.36 60.97 4.58 0.01
## 134 4 -26.32 61.05 4.66 0.01
## 123 4 -26.37 61.14 4.75 0.01
## 245 4 -26.69 61.78 5.40 0.01
## 2345 5 -25.62 61.85 5.47 0.01
## 15 3 -27.87 61.98 5.59 0.01
## 135 4 -26.85 62.10 5.72 0.01
## 124 4 -27.01 62.43 6.05 0.01
## 1234 5 -26.00 62.62 6.23 0.01
## 145 4 -27.51 63.42 7.03 0.00
## 125 4 -27.51 63.43 7.04 0.00
## 1345 5 -26.48 63.56 7.18 0.00
## 1235 5 -26.50 63.61 7.23 0.00
## 1245 5 -27.16 64.94 8.56 0.00
## 12345 6 -26.14 65.14 8.76 0.00
##
## Term codes:
## chicken_coop objects PCA_animal_Dim1 roof_covering wall_covering
## 1 2 3 4 5
##
## Model-averaged coefficients:
## (full average)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.39515 0.96830 2.474 0.0134 *
## PCA_animal_Dim1 0.03872 0.30274 0.128 0.8982
## roof_covering1 -0.26244 0.58637 0.448 0.6545
## objects1 0.48258 0.87524 0.551 0.5814
## wall_covering1 -0.32967 0.73992 0.446 0.6559
## chicken_coop1 0.59643 1.42269 0.419 0.6750
##
## (conditional average)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.39515 0.96830 2.474 0.01338 *
## PCA_animal_Dim1 0.07782 0.42562 0.183 0.85492
## roof_covering1 -0.80018 0.78615 1.018 0.30875
## objects1 1.48879 0.93020 1.601 0.10948
## wall_covering1 -1.46508 0.87724 1.670 0.09490 .
## chicken_coop1 3.52803 1.27670 2.763 0.00572 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sw(MAS)
## PCA_animal_Dim1 roof_covering objects wall_covering
## Sum of weights: 0.50 0.33 0.32 0.23
## N containing models: 16 16 16 16
## chicken_coop
## Sum of weights: 0.17
## N containing models: 16
#Coefficients and confidence intervals
ci<-confint(MAS)
# Extracting p-values from the summary of the conditional average model
summary_MAS <- summary(MAS)
pval <- summary_MAS$coefmat.subset[, "Pr(>|z|)"]
# Combining OR, CI, and p-values into a table
table.coef <- cbind(exp(cbind (OR=coef(MAS),ci)),pval)
print(table.coef, digits = 3)
## OR 2.5 % 97.5 % pval
## (Intercept) 0.0912 0.0137 0.608 0.01338
## PCA_animal_Dim1 1.0809 0.4694 2.489 0.85492
## roof_covering1 0.4492 0.0962 2.097 0.30875
## objects1 4.4317 0.7158 27.438 0.10948
## wall_covering1 0.2311 0.0414 1.290 0.09490
## chicken_coop1 34.0568 2.7892 415.844 0.00572
write.table(table.coef, "clipboard", sep="\t")
Of all the models (1a-1c), the model including wall and roof covering instead of house type had the lowest AIC. However, when performing model averaging, only the presence of a chicken coop was significantly associated with infestation, and six models fell within 2 AICc units of the best model, including the null model. This indicates that the model set is not robust and that a substantial amount of variance is not explained by the presence of chicken coops alone.
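The set of competing models can be extracted directly from the dredge table; a minimal sketch, assuming MMI from the chunk above:
# Models within 2 AICc units of the best-ranked model (the confidence set discussed above)
top_models <- subset(MMI, delta < 2)
top_models
nrow(top_models)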
dbEV_mod2 <- select(dbEV, INF, dist_lum, num_S_infest_50m, num_S_infest_100m, dist.infest)
dbEV_mod2 <- dbEV_mod2[complete.cases(dbEV_mod2), ]
fit2<-logistf(data=dbEV_mod2, INF ~ dist_lum + num_S_infest_100m, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
summary(fit2)
## logistf(formula = INF ~ dist_lum + num_S_infest_100m, data = dbEV_mod2,
## control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -3.67981361 0.51352773 -4.795538408 -2.74081376 Inf
## dist_lum 0.01981901 0.01325445 -0.009467053 0.04504547 1.902084
## num_S_infest_100m 1.45624804 0.28186004 0.934998231 2.07127192 33.502979
## p method
## (Intercept) 0.000000000000000 2
## dist_lum 0.167845265056839 2
## num_S_infest_100m 0.000000007115494 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=35.62756 on 2 df, p=0.00000001834739, n=201
## Wald test = 71.67107 on 2 df, p = 0.0000000000000002220446
# Fit a standard logistic regression model for VIF calculation
fit_standard <- glm(INF ~ dist_lum + num_S_infest_50m + num_S_infest_100m + dist.infest,
data = dbEV_mod2,
family = binomial,
na.action = na.fail)
# Compute VIF using the car package
vif_values <- car::vif(fit_standard)
# Display the VIF values
print(vif_values)
## dist_lum num_S_infest_50m num_S_infest_100m dist.infest
## 1.090275 21.346858 17.628302 2.126020
# Extract confidence intervals
ci <- confint(fit2)
# Extract odds ratios (OR)
or <- exp(coef(fit2))
# Extract p-values
p_values <- fit2$prob
# Combine OR, confidence intervals, and p-values into one table
table.coef <- cbind(OR = or, ci, P.value = p_values)
# Print the table with specified number of digits
print(table.coef, format = "f", digits = 3)
## OR Lower 95% Upper 95% P.value
## (Intercept) 0.0252 -4.79554 -2.741 0.00000000000
## dist_lum 1.0200 -0.00947 0.045 0.16784526506
## num_S_infest_100m 4.2898 0.93500 2.071 0.00000000712
# Convert the table to a data frame for better handling
table.coef <- as.data.frame(table.coef)
# Print the formatted table
print(table.coef)
## OR Lower 95% Upper 95% P.value
## (Intercept) 0.02522768 -4.795538408 -2.74081376 0.000000000000000
## dist_lum 1.02001671 -0.009467053 0.04504547 0.167845265056839
## num_S_infest_100m 4.28983400 0.934998231 2.07127192 0.000000007115494
# Optionally, write the table to the clipboard
write.table(table.coef, "clipboard", sep = "\t", col.names = NA, row.names = TRUE)
From the bivariate analyses, we decided to include the distance to street lights, the number of infested houses to the South within 50 m and 100 m radii, and the distance to the nearest infested house. Because of multicollinearity among the last three variables, we continued only with the number of infested houses within 100 m to the South.
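A quick re-check of the variance inflation factors after dropping the collinear terms supports that choice; a sketch using a standard glm as above, assuming dbEV_mod2 as defined earlier:
# VIFs for the reduced spatial model
fit_check <- glm(INF ~ dist_lum + num_S_infest_100m,
                 data = dbEV_mod2, family = binomial, na.action = na.omit)
car::vif(fit_check)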
3. Combined model
# Combined model with spatial variables
dbEV_mod3 <- select(dbEV, INF, chicken_coop, objects, wall_covering, roof_covering, PCA_animal_Dim1, num_S_infest_100m, dist_lum)
dbEV_mod3 <- dbEV_mod3[complete.cases(dbEV_mod3), ]
fit3<-logistf(data=dbEV_mod3, INF ~ wall_covering + roof_covering + chicken_coop + num_S_infest_100m, control = logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
summary(fit3)
## logistf(formula = INF ~ wall_covering + roof_covering + chicken_coop +
## num_S_infest_100m, data = dbEV_mod3, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.fail)
##
## Model fitted by Penalized ML
## Coefficients:
## coef se(coef) lower 0.95 upper 0.95 Chisq
## (Intercept) -2.060508 0.8575915 -4.1574581 -0.4863096 6.927464
## wall_covering1 -1.915892 1.0382381 -4.2376714 0.2286594 3.101711
## roof_covering1 -2.115163 1.2058849 -5.4616362 0.1248173 3.371579
## chicken_coop1 4.221911 1.2874148 1.8215968 7.6463463 12.369084
## num_S_infest_100m 1.771262 0.5764847 0.7117322 3.4275353 11.649284
## p method
## (Intercept) 0.0084881974 2
## wall_covering1 0.0782100603 2
## roof_covering1 0.0663301411 2
## chicken_coop1 0.0004365018 2
## num_S_infest_100m 0.0006422726 2
##
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
##
## Likelihood ratio test=26.10362 on 4 df, p=0.0000301581, n=104
## Wald test = 23.85293 on 4 df, p = 0.00008548336
#Coefficients and confidence intervals
ci<-confint(fit3)
table.coef <- exp(cbind (OR=coef(fit3),ci))
print(table.coef, digits = 3)
## OR Lower 95% Upper 95%
## (Intercept) 0.127 0.01565 0.615
## wall_covering1 0.147 0.01444 1.257
## roof_covering1 0.121 0.00425 1.133
## chicken_coop1 68.164 6.18172 2092.984
## num_S_infest_100m 5.878 2.03752 30.801
write.table(table.coef, "clipboard", sep="\t")
MMI <- dredge(fit3)
## Fixed term is "(Intercept)"
MMI
## Global model call: logistf(formula = INF ~ wall_covering + roof_covering + chicken_coop +
## num_S_infest_100m, data = dbEV_mod3, control = logistf.control(maxit = 10000,
## maxstep = 1), na.action = na.fail)
## ---
## Model selection table
## (Int) chc_cop num_S_inf_100 rof_cvr wll_cvr df logLik AICc delta weight
## 1 -2.429 1 -27.192 56.4 0.00 0.244
## 3 -3.075 0.9529 2 -26.438 57.0 0.57 0.183
## 5 -2.010 + 2 -26.852 57.8 1.40 0.121
## 7 -2.703 1.4540 + 3 -26.085 58.4 1.99 0.090
## 9 -1.224 + 2 -27.380 58.9 2.45 0.072
## 11 -1.804 1.0110 + 3 -26.605 59.5 3.03 0.054
## 2 -2.844 + 2 -27.700 59.5 3.10 0.052
## 4 -3.691 + 1.0960 3 -26.927 60.1 3.67 0.039
## 13 -1.072 + + 3 -27.036 60.3 3.89 0.035
## 15 -1.829 1.4310 + + 4 -26.251 60.9 4.48 0.026
## 6 -2.401 + + 3 -27.340 60.9 4.50 0.026
## 8 -3.382 + 1.7030 + 4 -26.555 61.5 5.09 0.019
## 10 -1.224 + + 3 -27.869 62.0 5.56 0.015
## 12 -1.988 + 1.2850 + 4 -27.077 62.6 6.13 0.011
## 14 -1.070 + + + 4 -27.507 63.4 6.99 0.007
## 16 -2.061 + 1.7710 + + 5 -26.706 64.0 7.60 0.005
## Models ranked by AICc(x)
write.table(MMI, "clipboard", sep="\t")
MAS <- model.avg(MMI)
summary(MAS)
##
## Call:
## model.avg(object = MMI)
##
## Component model call:
## logistf(formula = INF ~ <16 unique rhs>, data = dbEV_mod3, control =
## logistf.control(maxit = 10000, maxstep = 1), na.action = na.fail)
##
## Component models:
## df logLik AICc delta weight
## (Null) 1 -27.19 56.42 0.00 0.24
## 2 2 -26.44 57.00 0.57 0.18
## 3 2 -26.85 57.82 1.40 0.12
## 23 3 -26.08 58.41 1.99 0.09
## 4 2 -27.38 58.88 2.45 0.07
## 24 3 -26.61 59.45 3.03 0.05
## 1 2 -27.70 59.52 3.10 0.05
## 12 3 -26.93 60.09 3.67 0.04
## 34 3 -27.04 60.31 3.89 0.03
## 234 4 -26.25 60.91 4.48 0.03
## 13 3 -27.34 60.92 4.50 0.03
## 123 4 -26.55 61.51 5.09 0.02
## 14 3 -27.87 61.98 5.56 0.02
## 124 4 -27.08 62.56 6.13 0.01
## 134 4 -27.51 63.42 6.99 0.01
## 1234 5 -26.71 64.02 7.60 0.01
##
## Term codes:
## chicken_coop num_S_infest_100m roof_covering wall_covering
## 1 2 3 4
##
## Model-averaged coefficients:
## (full average)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3917 0.8495 2.815 0.00487 **
## num_S_infest_100m 0.4970 0.6518 0.763 0.44573
## roof_covering1 -0.4853 0.9571 0.507 0.61209
## wall_covering1 -0.3302 0.7547 0.438 0.66167
## chicken_coop1 0.5992 1.3749 0.436 0.66296
##
## (conditional average)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3917 0.8495 2.815 0.00487 **
## num_S_infest_100m 1.1608 0.4709 2.465 0.01370 *
## roof_covering1 -1.4699 1.1520 1.276 0.20197
## wall_covering1 -1.4648 0.9296 1.576 0.11508
## chicken_coop1 3.4223 1.0654 3.212 0.00132 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sw(MAS)
## num_S_infest_100m roof_covering wall_covering chicken_coop
## Sum of weights: 0.43 0.33 0.23 0.18
## N containing models: 8 8 8 8
#Coefficients and confidence intervals
ci<-confint(MAS)
# Extracting p-values from the summary of the conditional average model
summary_MAS <- summary(MAS)
pval <- summary_MAS$coefmat.subset[, "Pr(>|z|)"]
# Combining OR, CI, and p-values into a table
table.coef <- cbind(exp(cbind (OR=coef(MAS),ci)),pval)
print(table.coef, digits = 3)
## OR 2.5 % 97.5 % pval
## (Intercept) 0.0915 0.0173 0.483 0.00487
## num_S_infest_100m 3.1924 1.2685 8.034 0.01370
## roof_covering1 0.2299 0.0240 2.199 0.20197
## wall_covering1 0.2311 0.0374 1.429 0.11508
## chicken_coop1 30.6403 3.7970 247.254 0.00132
write.table(table.coef, "clipboard", sep="\t")
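For the written report, the model-averaged odds ratios can be rendered with kable in the same style as the earlier tables; a minimal sketch, assuming table.coef from the chunk above:
# Display the model-averaged odds ratios, 95% CIs and p-values as a formatted table
kable(as.data.frame(table.coef), digits = 3, align = "c",
      caption = "Model-averaged odds ratios for the combined model") %>%
  kable_styling(full_width = FALSE, position = "left",
                bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(0, bold = TRUE, color = "black")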