library(ggplot2) library(dplyr) library(tidyr) library(faux) num_simulations = 10000 num_subj = 50 true_cor = 0.3 noise_sd = 1 correlations_without_error = numeric(num_simulations) correlations_with_error = numeric(num_simulations) significant_correlations_with_error = numeric(0) significant_correlations_without_error = numeric(0) set.seed(123) for (i in 1:num_simulations) { data <- rnorm_multi(n=num_subj, mu = c(0,0), sd= c(5,5), r=true_cor, varnames = c("x","y")) x <- data$x y <- data$y y_noisy <- y + rnorm(num_subj, mean=0, sd=noise_sd) correlations_with_error[i] <- cor(x, y_noisy) correlations_without_error[i] <- cor(x, y) if (cor.test(x,y)$p.value < 0.05) { significant_correlations_without_error <- c(significant_correlations_without_error, abs(cor(x,y))) } if (cor.test(x, y_noisy)$p.value < 0.05) { significant_correlations_with_error <- c(significant_correlations_with_error, abs(cor(x,y_noisy))) } } cat("=== No error scenario ===\n") cat("Median estimate (only significant):", median(abs(significant_correlations_without_error)), "\n") cat("Standard deviation of the estimate (only significant):", sd(significant_correlations_without_error), "\n") cat("Median estimate (total):", median(abs(correlations_without_error)), "\n") cat("Number of significant correlations:", length(significant_correlations_without_error), "\n\n") cat("=== Error scenario ===\n") cat("Median estmate (only significant):", median(abs(significant_correlations_with_error)), "\n") cat("Standard deviation of the estimate (only significant):", sd(significant_correlations_with_error), "\n") cat("Median estimate (total):", median(abs(correlations_with_error)), "\n") cat("Number of significant correlations:", length(significant_correlations_with_error), "\n") ########################################################################################### set.seed(123) num_simulations <- 1000 rhos <- seq(0.1, 0.6, by = 0.01) noise_sd <- 2 num_subject <- 50 result_df <- data.frame( rho = numeric(), correlation_without_error = numeric(), correlation_with_error = numeric() ) for (rho in rhos) { for (i in 1:num_simulations) { data <- rnorm_multi(n=num_subj, mu = c(0,0), sd= c(5,5), r=rho, varnames = c("x","y")) x <- data$x y <- data$y y_noisy <- y + rnorm(n=num_subj, mean=0, sd = noise_sd) cor_without_error <- cor(x, y) cor_with_error <- cor(x, y_noisy) if (cor.test(x, y_noisy)$p.value < 0.05) { result_df <- rbind(result_df, data.frame( rho = rho, correlation_without_error = cor_without_error, correlation_with_error = cor_with_error )) } } } # Plot ggplot(result_df, aes(x=correlation_without_error, y=correlation_with_error)) + geom_point(color="red", alpha=0.6, size=1) + geom_abline(intercept=0, slope=1, color="black", linewidth=1.5) + labs(title="Comparison of Correlations", x="Without Noise", y="With Noise") + theme_minimal() percentage_greater_with_noise <- mean(abs(result_df$correlation_with_error) > abs(result_df$correlation_without_error)) * 100 cat(sprintf("Percentage of simulations where the estimate is bigger when adding noise and selecting for statistical significance: %.2f%%\n", percentage_greater_with_noise)) ###################################################################################### set.seed(123) # N list N_values <- c(seq(15, 50, by=1)) percentages <- numeric(length(N_values)) true_cor <- 0.2 results_list <- list() start_time <- proc.time() for (j in 1:length(N_values)) { N <- N_values[j] num_simulations = 5000 estimates_with_noise <- numeric(0) estimates_without_noise <- numeric(0) p_values_with_noise <- numeric(0) p_values_without_noise <- numeric(0) for (i in 1:num_simulations) { data <- rnorm_multi(n=N, mu = c(0,0), sd= c(5,5), r= true_cor , varnames = c("x","y")) x <- data$x y <- data$y y_noisy <- y + rnorm(N, mean=0, sd=1) p_without_noise <- cor.test(x,y)$p.value p_with_noise <- cor.test(x, y_noisy)$p.value if (p_without_noise < 0.05 && p_with_noise < 0.05) { estimates_without_noise <- c(estimates_without_noise, cor(x,y)) estimates_with_noise <- c(estimates_with_noise, cor(x,y_noisy)) p_values_without_noise <- c(p_values_without_noise, p_without_noise) p_values_with_noise <- c(p_values_with_noise, p_with_noise) } } results_list[[as.character(N)]] <- list(estimates_without_noise=estimates_without_noise, estimates_with_noise=estimates_with_noise, p_values_without_noise=p_values_without_noise, p_values_with_noise=p_values_with_noise) percentage_greater_with_noise <- mean(estimates_with_noise > estimates_without_noise) * 100 percentages[j] <- percentage_greater_with_noise } end_time <- proc.time() total_runtime <- end_time - start_time print(total_runtime) average_runtime_per_simulation <- total_runtime[3] / (length(N_values) * num_simulations) print(average_runtime_per_simulation) plot_data <- data.frame(N = N_values, percentages = percentages) # Plot ggplot(plot_data, aes(x=N, y=percentages)) + geom_line(color="red") + geom_hline(yintercept=50, color="blue", linetype="dashed") + labs(title="% of bigger estimate when adding noise and selecting for statistical significance ", x="N", y="% ") + theme_minimal()