```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r message=FALSE, warning=FALSE}
## Load packages, installing any that are missing.
## The workspace is deliberately not cleared here (no rm(list = ls()));
## restart R if a clean session is needed.
required_pkgs <- c("gptstudio", "tidyverse") # List of required packages

# installed.packages() returns a matrix; the package names are its row names.
missing_pkgs <- required_pkgs[!required_pkgs %in% rownames(installed.packages())]
for (pkg in missing_pkgs) {
  install.packages(pkg, dependencies = TRUE) # Install missing packages + dependencies
}

# library() stops with an error if a package cannot be loaded, whereas
# require() would only return FALSE and let later chunks fail confusingly.
for (pkg in required_pkgs) {
  library(pkg, character.only = TRUE)
}
```

# plot to improve

```{r}
ggplot(data = iris, aes(Sepal.Length, Petal.Length)) +
  geom_point()
```

# result of using chatGPT

```{r}
library(tidyverse)
library(scales) # provides pretty_breaks(), used in later chunks

# Improve this plot by using a non standard theme, coloring by species, and
# adding a second y-axis that shows Sepal.Width
ggplot(data = iris, aes(Sepal.Length, Petal.Length)) +
  geom_point()

# To apply a non-standard theme, add the theme() function and set the theme you
# want to use
# To color by species, add the color parameter inside the aes() function and
# set it to the name of the column that contains the species
# To add a second y-axis, use the sec.axis parameter inside
# scale_y_continuous() and set it equal to a call to the function sec_axis()
# which specifies the transformation to the second y-axis.
library(ggplot2)

# NOTE(review): sec_axis(~ . * 1) is an identity transform of the primary
# (Petal.Length) axis, so the axis labelled "Sepal Width" does not actually
# show Sepal.Width values. Left as generated; the same applies to the
# sec_axis(~ .) calls in the chunks below.
ggplot(data = iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point() +
  theme_bw() + # choose a black and white theme
  scale_color_brewer(palette = "Set1") + # choose a color scheme for the species
  labs(x = "Sepal Length", y = "Petal Length") +
  # set up the second y axis to show Sepal Width
  scale_y_continuous(sec.axis = sec_axis(~ . * 1, name = "Sepal Width"))
```

```{r}
# Result:
# Set theme to "dark"
# Color by species using `color = species`
# Add `scale_y_continuous()` with `sec.axis = sec_axis()`
ggplot(data = iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point() +
  scale_color_manual(values = c("#FF0000", "#00FF00", "#0000FF")) +
  theme_dark() +
  scale_y_continuous(
    sec.axis = sec_axis(~., name = "Sepal Width", breaks = pretty_breaks())
  )
```

```{r}
library(ggpmisc) # provides stat_poly_eq()

# Improve this plot by removing gridlines, setting the minimum of each y axis
# to zero, make the plot background white, increase the size of points, add a
# linear regression that plots relationship between Sepal.Length and
# Petal.Length across all species, and add R-squared value for linear
# regression to the plot.
ggplot(data = iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point() +
  scale_color_manual(values = c("#FF0000", "#00FF00", "#0000FF")) +
  theme_dark() +
  scale_y_continuous(
    sec.axis = sec_axis(~., name = "Sepal Width", breaks = pretty_breaks())
  )

# Solution:
# Add the following code to the existing code:
# Remove gridlines
# Set minimum of y-axis to zero
# Set the plot background to white
# Increase size of points
# Add linear regression line and R-squared value
ggplot(data = iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(size = 4) +
  scale_color_manual(values = c("#FF0000", "#00FF00", "#0000FF")) +
  theme_light() + # Change theme to light
  scale_y_continuous(
    limits = c(0, NA), # Change y axis limit
    sec.axis = sec_axis(~., name = "Sepal Width", breaks = pretty_breaks())
  ) +
  theme(
    panel.grid = element_blank(), # Remove gridlines
    plot.background = element_rect(fill = "white") # Set plot background to white
  ) +
  # Add linear regression line; group = 1 fits one line across all species
  geom_smooth(
    method = "lm", formula = y ~ x,
    aes(group = 1), se = FALSE
  ) +
  # Add equation and R-squared value for linear regression.
  # after_stat() replaces the deprecated ..eq.label.. / ..rr.label.. notation.
  stat_poly_eq(
    formula = y ~ x,
    aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
    parse = TRUE,
    label.x.npc = "right",
    label.y.npc = 0.9
  )
```

```{r}
# Improve this plot by:
# 1. Change the color of the points to three different hues of purple
# 2. Make the axis labels and axis values larger
# 3. Make the border and axis lines thicker
# 4. Move the output of stat_poly_eq to the top left quadrant of the graph
# 5. Make the regression line black; make sure it appears behind the data points
# 6. Do not paste the equation of the linear regression, just add the R-square
#    value
ggplot(data = iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(size = 4) +
  scale_color_manual(values = c("#FF0000", "#00FF00", "#0000FF")) +
  theme_light() + # Change theme to light
  scale_y_continuous(
    limits = c(0, NA), # Change y axis limit
    sec.axis = sec_axis(~., name = "Sepal Width", breaks = pretty_breaks())
  ) +
  theme(
    panel.grid = element_blank(), # Remove gridlines
    plot.background = element_rect(fill = "white") # Set plot background to white
  ) +
  # Add linear regression line
  geom_smooth(
    method = "lm", formula = y ~ x,
    aes(group = 1), se = FALSE
  ) +
  # Add R-squared value for linear regression.
  stat_poly_eq(
    formula = y ~ x,
    aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
    parse = TRUE,
    label.x.npc = "right",
    label.y.npc = 0.9
  )

# Solution
# R-squared of the single regression fitted across all species; computed up
# front so the annotate() call below stays readable.
overall_fit <- lm(Petal.Length ~ Sepal.Length, data = iris)
r_squared_label <- paste0(
  "R-squared: ",
  signif(summary(overall_fit)$r.squared, 3)
)

ggplot(data = iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(size = 4, alpha = 0.8) + # Set alpha to 0.8 for better visibility
  # Change to three hues of purple
  scale_color_manual(values = c("#A040A0", "#7B1FA2", "#4A148C")) +
  # Increase base size and line thickness
  theme_light(base_size = 16, base_line_size = 1.2) +
  # Leave y axis limit unchanged
  scale_y_continuous(
    limits = c(0, NA),
    sec.axis = sec_axis(~., name = "Sepal Width", breaks = pretty_breaks())
  ) +
  # Increase border and grid line thickness, and set plot background to white
  # with a black border
  theme(
    panel.grid = element_line(size = 1.2),
    plot.background = element_rect(fill = "white", color = "black")
  ) +
  # Add linear regression line and make it black
  geom_smooth(
    method = "lm", formula = y ~ x,
    aes(group = 1), se = FALSE, color = "black"
  ) +
  # Add R-squared value for linear regression, positioned at top left
  annotate(
    "text",
    label = r_squared_label,
    x = min(iris$Sepal.Length), y = max(iris$Petal.Length),
    vjust = 1, hjust = 0
  )
```

```{r}
# using the iris dataset,
# cluster data into three groups based on petal length and sepal width

# Begin by loading the iris dataset that comes pre-installed in R.
data(iris)

# Create a subset of the iris dataset including only the variables petal length
# and sepal width.
iris_subset <- iris[c("Petal.Length", "Sepal.Width")]

# Use the kmeans function to create three clusters based on the iris_subset.
# kmeans() picks random starting centers, so fix the seed to keep the cluster
# numbering (and therefore the plot colors) the same on every knit.
set.seed(42)
kmeans_clusters <- kmeans(iris_subset, centers = 3)

# Plot the data points colored by their assigned cluster.
plot(iris_subset, col = kmeans_clusters$cluster)
```

```{r}
# Old
# improve upon this plot, use a different symbol for each species

# Begin by loading the iris dataset that comes pre-installed in R.
data(iris)

# Create a subset of the iris dataset including only the variables petal length
# and sepal width.
iris_subset <- iris[c("Petal.Length", "Sepal.Width")]

# Use the kmeans function to create three clusters based on the iris_subset.
# Same seed as the previous chunk so both chunks produce the same clustering.
set.seed(42)
kmeans_clusters <- kmeans(iris_subset, centers = 3)

# output
# Plot the data points colored by their assigned cluster.
plot(iris_subset, col = kmeans_clusters$cluster)

# Create a vector of symbols with one symbol for each species.
species_pch <- c(16, 17, 18)

# Use the pch argument to specify the symbol based on species in the plot()
# function (indexing by the factor uses its integer level codes).
plot(
  iris_subset,
  col = kmeans_clusters$cluster,
  pch = species_pch[iris$Species]
)

# Add a legend to the plot to show which symbol corresponds to which species.
# NOTE(review): point colors encode the k-means cluster, not the species, so
# the colors shown in this legend do not identify species; only the symbols do.
legend(
  "topright",
  legend = levels(iris$Species),
  pch = species_pch,
  col = seq_along(species_pch)
)
```

```{r}
# Improve this plot by adding ellipses around each cluster center for the
# groups in this plot
plot(
  iris_subset,
  col = kmeans_clusters$cluster,
  pch = species_pch[iris$Species]
)
legend(
  "topright",
  legend = levels(iris$Species),
  pch = species_pch,
  col = seq_along(species_pch)
)

# First, we need to compute the cluster centers. iris_subset has no `cluster`
# column, so group by the assignment vector stored in the k-means result.
# Result columns: cluster, Petal.Length, Sepal.Width.
centers <- aggregate(
  iris_subset,
  by = list(cluster = kmeans_clusters$cluster),
  FUN = mean
)

# Then we can add ellipses around each center with the 'ellipse' function from
# the 'car' package. The ellipses are drawn onto the plot created above, so the
# points must not be re-plotted afterwards (a new plot() would discard them).
library(car)
ellipse_radius <- sqrt(qchisq(0.95, df = 2)) # 95% ellipse under bivariate normality
for (i in seq_len(nrow(centers))) {
  in_cluster <- kmeans_clusters$cluster == centers$cluster[i]
  covmat <- cov(iris_subset[in_cluster, ])
  car::ellipse(
    center = unlist(centers[i, 2:3], use.names = FALSE),
    shape = covmat,
    radius = ellipse_radius,
    col = "black", lty = "dashed", lwd = 1
  )
}
```