```{r setup, include=FALSE}
# Set the default chunk option `echo = TRUE` for all chunks that follow.
knitr::opts_chunk$set(echo = TRUE)
```
```{R message=FALSE, warning=FALSE}
## Package setup ----
## NOTE: the former `rm(list = ls())` was removed. Knitting already evaluates
## the document in a fresh environment, and wiping the workspace when the
## chunk is run interactively can silently destroy unrelated objects.

## Packages required by this document
load.lib <- c("gptstudio", "tidyverse")

## Compare against package *names* only. `installed.packages()` returns a
## character matrix, so a bare `%in%` would also match version strings,
## library paths, dependency fields, etc.
install.lib <- setdiff(load.lib, rownames(installed.packages()))

## Install whatever is missing, together with its dependencies
for (lib in install.lib) {
  install.packages(lib, dependencies = TRUE)
}

## Attach every package. library() stops with an error when a package is
## unavailable, whereas require() only returns FALSE and lets later chunks
## fail with a confusing "could not find function" message.
for (lib in load.lib) {
  library(lib, character.only = TRUE)
}
```
# Plot to improve
```{r}
# Baseline figure: petal length against sepal length for the iris data,
# drawn as a plain scatter plot with ggplot2 defaults.
ggplot(iris, mapping = aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point()
```
# Result of using ChatGPT
```{r}
# Attach with library() so a missing package stops the knit immediately.
# ggplot2 is attached by tidyverse, so it is not loaded a second time.
library(tidyverse)
library(scales)

# Prompt:
# Improve this plot by using a non standard theme, coloring by species, and adding a second y-axis that shows Sepal.Width
ggplot(data = iris, aes(Sepal.Length, Petal.Length)) +
  geom_point()

# Answer:
# To apply a non-standard theme, add a complete theme such as theme_bw().
# To color by species, map `color` inside aes() to the Species column.
# To add a second y-axis, pass sec_axis() to the `sec.axis` argument of
# scale_y_continuous(); sec_axis() specifies the transformation from the
# primary to the secondary axis.
#
# A secondary axis is only meaningful if data is drawn against it, so
# Sepal.Width is plotted as a second point layer (triangles). Both measures
# are in the same unit and range, so the identity transformation is the
# correct mapping between the two axes.
ggplot(data = iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(aes(shape = "Petal Length")) +
  geom_point(aes(y = Sepal.Width, shape = "Sepal Width")) +
  scale_shape_manual(
    name = "Measure",
    values = c("Petal Length" = 16, "Sepal Width" = 17)
  ) +
  theme_bw() + # black and white theme
  scale_color_brewer(palette = "Set1") + # color scheme for the species
  labs(x = "Sepal Length", y = "Petal Length") +
  scale_y_continuous(sec.axis = sec_axis(~ . * 1, name = "Sepal Width"))
```
```{r}
# Result (second variant of the same prompt):
#   * dark theme
#   * points colored by species via `color = Species`
#   * secondary y-axis via `scale_y_continuous(sec.axis = sec_axis())`
#
# Sepal.Width is drawn as its own point layer (triangles) so that the
# secondary axis labelled "Sepal Width" actually describes plotted data; the
# identity transformation is valid because both measures share unit and range.
# pretty_breaks() is namespace-qualified so this chunk does not depend on
# `scales` having been attached by an earlier chunk.
ggplot(data = iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(aes(shape = "Petal Length")) +
  geom_point(aes(y = Sepal.Width, shape = "Sepal Width")) +
  scale_shape_manual(
    name = "Measure",
    values = c("Petal Length" = 16, "Sepal Width" = 17)
  ) +
  scale_color_manual(values = c("#FF0000", "#00FF00", "#0000FF")) +
  theme_dark() +
  scale_y_continuous(
    sec.axis = sec_axis(
      ~.,
      name = "Sepal Width",
      breaks = scales::pretty_breaks()
    )
  )
```
```{r}
# library() errors if ggpmisc is missing; require() would only return FALSE
# and stat_poly_eq() below would then fail with "could not find function".
library(ggpmisc)

# Prompt:
# Improve this plot by removing gridlines, setting the minimum of each y axis to zero, make the plot background white, increase the size of points, add a linear regression that plots relationship between Sepal.Length and Petal.Length across all species, and add R-squared value for linear regression to the plot.

# Plot to improve (Sepal.Width drawn as triangles so the secondary axis
# refers to plotted data).
ggplot(data = iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(aes(shape = "Petal Length")) +
  geom_point(aes(y = Sepal.Width, shape = "Sepal Width")) +
  scale_shape_manual(
    name = "Measure",
    values = c("Petal Length" = 16, "Sepal Width" = 17)
  ) +
  scale_color_manual(values = c("#FF0000", "#00FF00", "#0000FF")) +
  theme_dark() +
  scale_y_continuous(
    sec.axis = sec_axis(
      ~.,
      name = "Sepal Width",
      breaks = scales::pretty_breaks()
    )
  )

# Solution:
#   * remove gridlines
#   * set minimum of the y-axes to zero
#   * set the plot background to white
#   * increase size of points
#   * add linear regression line and its equation / R-squared
#
# `color = Species` is mapped on the point layers only. If it were mapped in
# ggplot(), stat_poly_eq() would fit one model per species and print three
# labels that do not belong to the single pooled regression line.
ggplot(data = iris, aes(Sepal.Length, Petal.Length)) +
  geom_point(aes(color = Species, shape = "Petal Length"), size = 4) +
  geom_point(
    aes(y = Sepal.Width, color = Species, shape = "Sepal Width"),
    size = 4
  ) +
  scale_shape_manual(
    name = "Measure",
    values = c("Petal Length" = 16, "Sepal Width" = 17)
  ) +
  scale_color_manual(values = c("#FF0000", "#00FF00", "#0000FF")) +
  theme_light() + # light theme
  scale_y_continuous(
    limits = c(0, NA), # y-axes start at zero
    sec.axis = sec_axis(
      ~.,
      name = "Sepal Width",
      breaks = scales::pretty_breaks()
    )
  ) +
  theme(
    panel.grid = element_blank(), # remove gridlines
    plot.background = element_rect(fill = "white") # white plot background
  ) +
  # Pooled linear regression of Petal.Length on Sepal.Length
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  # Equation and R-squared of that same pooled fit; after_stat() replaces the
  # deprecated `..name..` notation.
  stat_poly_eq(
    formula = y ~ x,
    aes(label = paste(after_stat(eq.label), after_stat(rr.label),
                      sep = "~~~")),
    parse = TRUE, label.x.npc = "right", label.y.npc = 0.9
  )
```
```{r}
# Improve this plot by:
# 1. Change the color of the points to three different hues of purple
# 2. Make the axis labels and axis values larger
# 3. Make the border and axis lines thicker
# 4. Move the output of stat_poly_eq to the top left quadrant of the graph
# 5. Make the regression line black; make sure it appears behind the data points
# 6. Do not paste the equation of the linear regression, just add the R-square value

# Plot to improve. Species color is mapped on the point layers only so that
# the regression line and its label describe one pooled fit; Sepal.Width is
# drawn as triangles so the secondary axis refers to plotted data.
ggplot(data = iris, aes(Sepal.Length, Petal.Length)) +
  geom_point(aes(color = Species, shape = "Petal Length"), size = 4) +
  geom_point(
    aes(y = Sepal.Width, color = Species, shape = "Sepal Width"),
    size = 4
  ) +
  scale_shape_manual(
    name = "Measure",
    values = c("Petal Length" = 16, "Sepal Width" = 17)
  ) +
  scale_color_manual(values = c("#FF0000", "#00FF00", "#0000FF")) +
  theme_light() +
  scale_y_continuous(
    limits = c(0, NA),
    sec.axis = sec_axis(
      ~.,
      name = "Sepal Width",
      breaks = scales::pretty_breaks()
    )
  ) +
  theme(
    panel.grid = element_blank(),
    plot.background = element_rect(fill = "white")
  ) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  stat_poly_eq(
    formula = y ~ x,
    aes(label = paste(after_stat(eq.label), after_stat(rr.label),
                      sep = "~~~")),
    parse = TRUE, label.x.npc = "right", label.y.npc = 0.9
  )

# Solution
# R-squared of the pooled regression, computed once and used as annotation.
pooled_fit <- lm(Petal.Length ~ Sepal.Length, data = iris)
r_squared_label <- paste0(
  "R-squared: ",
  signif(summary(pooled_fit)$r.squared, 3)
)

ggplot(data = iris, aes(Sepal.Length, Petal.Length)) +
  # Layers are drawn in the order they are added: the regression line comes
  # first so that it sits behind the data points (requirement 5).
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "black") +
  geom_point(
    aes(color = Species, shape = "Petal Length"),
    size = 4, alpha = 0.8 # slight transparency for overlapping points
  ) +
  geom_point(
    aes(y = Sepal.Width, color = Species, shape = "Sepal Width"),
    size = 4, alpha = 0.8
  ) +
  scale_shape_manual(
    name = "Measure",
    values = c("Petal Length" = 16, "Sepal Width" = 17)
  ) +
  # Three hues of purple
  scale_color_manual(values = c("#A040A0", "#7B1FA2", "#4A148C")) +
  # Larger text and thicker lines throughout the theme
  theme_light(base_size = 16, base_line_size = 1.2) +
  scale_y_continuous(
    limits = c(0, NA),
    sec.axis = sec_axis(
      ~.,
      name = "Sepal Width",
      breaks = scales::pretty_breaks()
    )
  ) +
  theme(
    # `linewidth` replaces the deprecated `size` argument of element_line()
    panel.grid = element_line(linewidth = 1.2),
    # white plot background with a black outline
    plot.background = element_rect(fill = "white", color = "black")
  ) +
  # R-squared only, anchored at the top-left corner of the data range
  annotate(
    "text",
    label = r_squared_label,
    x = min(iris$Sepal.Length), y = max(iris$Petal.Length),
    vjust = 1, hjust = 0
  )
```
```{r}
# using the iris dataset,
# cluster data into three groups based on petal length and sepal width

# Load the iris dataset that ships with R.
data(iris)

# Keep only the two variables used for clustering.
iris_subset <- iris[c("Petal.Length", "Sepal.Width")]

# k-means starts from randomly chosen centers. Fix the seed so that every knit
# produces the same clusters, and use several random starts so the result is
# not a poor local optimum of a single unlucky start.
set.seed(123)
kmeans_clusters <- kmeans(iris_subset, centers = 3, nstart = 25)

# Plot the data points colored by their assigned cluster.
plot(iris_subset, col = kmeans_clusters$cluster)
```
```{r}
# Old
# improve upon this plot, use a different symbol for each species

# Load the iris dataset that ships with R.
data(iris)

# Keep only the two variables used for clustering.
iris_subset <- iris[c("Petal.Length", "Sepal.Width")]

# Seeded, multi-start k-means so the clusters are identical on every knit.
set.seed(123)
kmeans_clusters <- kmeans(iris_subset, centers = 3, nstart = 25)

# output
# Plot to improve: points colored by their assigned cluster.
plot(iris_subset, col = kmeans_clusters$cluster)

# One plotting symbol per species, in the order of levels(iris$Species).
symbols <- c(16, 17, 18)

# Color encodes the cluster, symbol encodes the species.
plot(
  iris_subset,
  col = kmeans_clusters$cluster,
  pch = symbols[as.integer(iris$Species)]
)

# Two separate legends, because color and symbol encode different things.
# The species legend is drawn without colors: coloring its entries 1:3 would
# wrongly suggest that color identifies the species.
legend("topright", legend = levels(iris$Species), pch = symbols,
       title = "Species")
cluster_ids <- seq_len(nrow(kmeans_clusters$centers))
legend("bottomleft", legend = paste("Cluster", cluster_ids),
       col = cluster_ids, pch = 15, title = "k-means cluster")
```
```{r}
# Improve this plot by adding ellipses around each cluster center for the groups in this plot
# Uses `iris_subset`, `kmeans_clusters` and `symbols` created in the chunks
# above.

# Plot to improve. The species legend carries symbols only, because color
# encodes the k-means cluster, not the species.
plot(iris_subset, col = kmeans_clusters$cluster, pch = symbols[iris$Species])
legend("topright", legend = levels(iris$Species), pch = symbols,
       title = "Species")

# ellipse() from the 'car' package draws an ellipse from a center, a shape
# (covariance) matrix and a radius.
library(car)

# Draw the points first: plot() starts a new figure, so calling it after the
# ellipses would erase them.
plot(iris_subset, col = kmeans_clusters$cluster, pch = symbols[iris$Species])
legend("topright", legend = levels(iris$Species), pch = symbols,
       title = "Species")

# The cluster centers are part of the kmeans() result (one row per cluster,
# one column per variable); `iris_subset` has no `cluster` column to
# aggregate on.
centers <- kmeans_clusters$centers

# Radius giving a 95% concentration ellipse for two variables under a
# bivariate-normal approximation.
ellipse_radius <- sqrt(qchisq(0.95, df = 2))

for (i in seq_len(nrow(centers))) {
  members <- iris_subset[kmeans_clusters$cluster == i, , drop = FALSE]
  # A covariance matrix needs at least a few points to be meaningful.
  if (nrow(members) < 3) {
    next
  }
  car::ellipse(
    center = centers[i, ],
    shape = cov(members),
    radius = ellipse_radius,
    col = i, # same color as the points of this cluster
    lty = "dashed",
    center.pch = 3, # mark the cluster center with a cross
    add = TRUE # overlay on the existing scatter plot
  )
}
```