density estimate

A kernel density estimate is useful for displaying the distribution of a variable that has some underlying smoothness.

Example

In order to use this kind of plot, you need to install the ggplot2 package.

At the beginning, let's use ggplot to show the distribution of the SCORE column.

1. Let's choose proper data. (don't run this code)

ggplot(moody)

We use the dataset "moody" to do the analysis.

2. Tell ggplot which particular column we want to analyze, in this example it is the SCORE column (don't run this code)

ggplot(moody,aes(SCORE))

aes(the column whose values you want to plot on the x-axis)

3. Show the density distribution of the data you want to analyze.

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE))+geom_density()
 #"geom_density()" can give you the density distribution

But in most cases we want to show the relationship between two variables rather than just one, for example the relationship between a numerical variable and a categorical variable.

For example, say we want to show the distribution of SCORE based on GRADE.

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,color=GRADE))+geom_density()
 #The color function not only adds different colors to different grade values but also separates scores into different groups based on different grades. aes means the aesthetic function.

If you want to make the curves smoother, you can use the "adjust" argument of geom_density().

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,color=GRADE))+geom_density(adjust=2)
 #the default value of adjust is 1

Or you may want the curves to follow the individual data points more closely, which you get with an "adjust" value smaller than 1.

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,color=GRADE))+geom_density(adjust=1/5)

Maybe you only want to see the distribution within a given range of x values; you can set that range with xlim.

For example I only want to analyze the distribution of scores between 60-100.

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,color=GRADE))+geom_density()+xlim(60,100)
 #xlim(the range of score)

You can fill colors inside the curves by using "fill"

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,color=GRADE,fill=GRADE))+geom_density()

Bothered by the overlap of color?

Then you can adjust the transparency of the colors by changing the value of "alpha".

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,color=GRADE,fill=GRADE))+geom_density(alpha=0.1)

Or you can generate the stacked density plot by using position="stack"

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,color=GRADE,fill=GRADE))+geom_density(position="stack")

You can also choose not to show the density but to count the number of elements with the values of x.

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,..count..,color=GRADE,fill=GRADE))+geom_density(position="stack")
 #"..count.." means counting the number rather than showing the density.

Using position="fill" makes it easier to compare the proportion of each grade at every score.

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,..count..,color=GRADE,fill=GRADE))+geom_density(position="fill")

Finally, you may want to consider A B C in one category, in other words only two categories: "F" and "not F"

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,color=(GRADE=="F"),fill=(GRADE=="F")))+geom_density()
 #"GRADE=="F" is a judgement, so ggplot will fill different colors based on whether GRADE = F is TRUE or FALSE

We can also change the title of the legend in the density plot.

 moody <- read.csv("https://raw.githubusercontent.com/kunal0895/RDatasets/master/Moody2018.csv")
 library(ggplot2)
 ggplot(moody,aes(SCORE,fill=(GRADE=="F")))+geom_density()+labs(fill="F or not")
 #remember to delete the color function in this case, otherwise you will have two legends.

ggplot(data, aes())+...