Color resources: https://sape.inf.usi.ch/quick-reference/ggplot2/colour https://r-graph-gallery.com/38-rcolorbrewers-palettes.html https://ggplot2.tidyverse.org/reference/ggtheme.html https://r-graph-gallery.com/79-levelplot-with-ggplot2.html
# Path of the listings CSV that is read into tx_data further down
dataset_file <- "real_estate_texas_500_2024.csv"
# Output path for the cleaned data set; disabled together with the
# commented-out write.csv() call later in the script
#create_file <- "real_estate_texas_500_2024_ADDRESS.csv"
library(tidyverse)
library(mosaic)
library(lubridate)
library(reshape)
library(reshape2)
library(gcookbook)
library(scales)
library(mapproj)
library(zoo)
library(gridExtra)
library(RColorBrewer)
library(datasets)
library(ggforce)
library(ggbeeswarm)
library(ggmosaic)
library(magrittr)
library(scales)
library(tidyquant)
library(readr)
library(ggpubr)
library(stringr)
library(tinytex)
library(rmarkdown)
library(fastDummies)
library(caret)
library(dplyr)
library(RANN)
# Load the raw listings into a data frame, then print a compact
# column-by-column overview (type and first values of each column)
tx_data <- read.csv(file = dataset_file)
tx_data %>% glimpse()
## Rows: 501
## Columns: 14
## $ url <chr> "https://www.realtor.com/realestateandhomes-detail/104…
## $ status <chr> "for_sale", "for_sale", "for_sale", "for_sale", "for_s…
## $ id <dbl> 9773941616, 9224923922, 9840661824, 7338317229, 728584…
## $ listPrice <int> 240000, 379900, 370000, 444000, 569000, 875000, 214500…
## $ baths <int> 2, 4, 2, 4, 2, 5, 2, 3, 2, 2, 2, 6, 2, 3, 3, 2, 0, 2, …
## $ baths_full <int> 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 5, 2, 2, 3, 2, NA, 2,…
## $ baths_full_calc <int> 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 5, 2, 2, 3, 2, NA, 2,…
## $ beds <int> 3, 4, 4, 5, 3, 4, 4, 5, 4, 3, 3, 6, 3, 3, 4, 4, NA, 3,…
## $ sqft <int> 1190, 2033, 2062, 3705, 3282, 4873, 2260, 2109, 1896, …
## $ stories <int> 1, 1, 1, 2, 2, 2, NA, 1, 1, 1, NA, 2, 1, 1, 2, 1, NA, …
## $ sub_type <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ text <chr> "Welcome home to your peaceful retreat nestled on 2 ac…
## $ type <chr> "single_family", "single_family", "single_family", "si…
## $ year_built <int> 2018, 2002, 2012, 1985, 1981, 1999, 2020, 1956, 2000, …
# Checking count of all NAs in the tx_data
# (total number of missing cells across every row and column)
sum(is.na(tx_data))
## [1] 578
# Per-column summary statistics; for numeric columns the output also
# reports how many NAs each column contains
summary(tx_data)
## url status id listPrice
## Length:501 Length:501 Min. :7.022e+09 Min. : 10000
## Class :character Class :character 1st Qu.:8.995e+09 1st Qu.: 264745
## Mode :character Mode :character Median :9.420e+09 Median : 374900
## Mean :9.193e+09 Mean : 510669
## 3rd Qu.:9.798e+09 3rd Qu.: 539000
## Max. :9.992e+09 Max. :28950000
## NA's :2
## baths baths_full baths_full_calc beds
## Min. :0.000 Min. :1.000 Min. :1.000 Min. :0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000
## Median :2.000 Median :2.000 Median :2.000 Median :3.000
## Mean :2.323 Mean :2.333 Mean :2.333 Mean :3.455
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :8.000 Max. :8.000 Max. :8.000 Max. :9.000
## NA's :65 NA's :65 NA's :61
## sqft stories sub_type text
## Min. : 0 Min. :1.000 Length:501 Length:501
## 1st Qu.: 1604 1st Qu.:1.000 Class :character Class :character
## Median : 2034 Median :1.000 Mode :character Mode :character
## Mean : 2335 Mean :1.376
## 3rd Qu.: 2636 3rd Qu.:2.000
## Max. :67139 Max. :4.000
## NA's :63 NA's :110
## type year_built
## Length:501 Min. :1891
## Class :character 1st Qu.:1981
## Mode :character Median :2006
## Mean :2000
## 3rd Qu.:2022
## Max. :2024
## NA's :212
# Derive street_address, city, state and zip from the listing URL.
# The pattern captures four underscore-separated pieces that follow
# "realestateandhomes-detail/": street, city, state and a 5-digit ZIP.
url_groups <- "realestateandhomes-detail\\/([^_]+)_([^_]+)_([^_]+)_([0-9]{5})"
# str_match() returns a matrix: column 1 is the full match, columns 2-5
# hold the capture groups (all NA for a URL that does not match)
str_matches <- str_match(tx_data$url, url_groups)
# Copy each capture group into its own column of tx_data
address_parts <- c("street_address", "city", "state", "zip")
for (part_idx in seq_along(address_parts)) {
  tx_data[[address_parts[part_idx]]] <- str_matches[, part_idx + 1]
}
# Street and city use "-" as the word separator in the URL; swap in spaces
for (dashed_col in c("street_address", "city")) {
  tx_data[[dashed_col]] <- str_replace_all(tx_data[[dashed_col]], fixed("-"), " ")
}
# ZIP is stored as an integer (a ZIP with a leading zero would lose it)
tx_data$zip <- as.integer(tx_data$zip)
# Dropping url and status is currently disabled:
# tx_data <- select(tx_data, -url, -status)
# Inspect the first rows to confirm the new columns
head(tx_data)
## url
## 1 https://www.realtor.com/realestateandhomes-detail/10410-Daw-Collins-Rd_Cleveland_TX_77328_M97739-41616
## 2 https://www.realtor.com/realestateandhomes-detail/6800-Woodland-Dr_Athens_TX_75752_M92249-23922
## 3 https://www.realtor.com/realestateandhomes-detail/110-County-Road-3456_Hawkins_TX_75765_M98406-61824
## 4 https://www.realtor.com/realestateandhomes-detail/1204-Vera-Ln_Kennedale_TX_76060_M73383-17229
## 5 https://www.realtor.com/realestateandhomes-detail/598-Stagecoach-Trl_Denison_TX_75021_M72858-45528
## 6 https://www.realtor.com/realestateandhomes-detail/5601-Joshua-Ct_Mansfield_TX_76063_M75504-52644
## status id listPrice baths baths_full baths_full_calc beds sqft
## 1 for_sale 9773941616 240000 2 2 2 3 1190
## 2 for_sale 9224923922 379900 4 3 3 4 2033
## 3 for_sale 9840661824 370000 2 2 2 4 2062
## 4 for_sale 7338317229 444000 4 3 3 5 3705
## 5 for_sale 7285845528 569000 2 2 2 3 3282
## 6 for_sale 7550452644 875000 5 3 3 4 4873
## stories sub_type
## 1 1
## 2 1
## 3 1
## 4 2
## 5 2
## 6 2
## text
## 1 Welcome home to your peaceful retreat nestled on 2 acres of beautiful land! This charming home offers the perfect blend of comfort and serenity. Step inside to discover a cozy living space where relaxation meets functionality. The open layout seamlessly connects the living room, dining area, and kitchen, creating an inviting atmosphere for everyday living and entertaining. Outside, you'll find a spacious yard with plenty of room to roam and explore. Plus, the property includes a handy storage shed with electricity, and insulation perfect for storing tools, equipment, or creating your own workshop.Located in a peaceful setting, yet just a short drive from amenities and attractions, this property offers the best of both worlds rural tranquility and suburban convenience. Don't miss your chance to make this delightful home yours! Schedule a showing today and start living the good life. ??
## 2 Beautiful country home on 0.85 fenced acres, minutes from Athens' amenities. Sit in the swing on this large front porch, of this recently updated home, and enjoy the peace & quietness of the neighborhood. This beautiful 4 bed, 3.5 bath home has an open dining/kitchen area with granite counters & bar seating. The WBFP is the center of attention in the spacious living room. New Aerobic system 2022, 2 updated bathrooms as well new carpet. The 4th bedroom has its own ensuite & a sitting area. This could be utilized as an office. The 2-car garage & 4 car carport have all your vehicles covered, what more could you want! A workshop with plenty of storage and a 1st class chicken coup in the spacious fenced back yard is perfect. Buyer to obtain own survey. Call the Listing Agent for your personal tour.
## 3 PRICED TO SELL CORNER LOT HAS A STORM SHELTER IN GARAGE
## 4 Come check out country living in the city! Are you looking for a large family home? This is the one for you! This 5 BR, 3.5 bath, 2 large living areas, office, flex room currently being used as a 6th bedroom boasts 3705sqf on just under half an acre on a secluded cul-de-sac in Kennedale ISD. This street is one of the most coveted streets with the best neighbors in Kennedale. Features include: 2 huge living areas downstairs plus an office & oversized primary BR suite. Updates include: fresh paint on outside of home & in garage, replaced water heaters, laminate flooring in primary BR, updated shower, updated second full bath. Kitchen has that farmhouse charm, granite counter tops, & windows that overlook backyard acreage plus greenbelt behind it for all the country feel. Space for chickens & goats in fenced pens, a shed, garden, and a dog run. If you are looking for a large home with plenty of space for your family and friends to host all year long, this is the one for you. Welcome Home!
## 5 Welcome to your dream retreat! Nestled on over 7 acres of secluded land adorned with towering mature oaks, this massively well-renovated home offers a truly unique living experience. With over 3200 square feet of living space, including three bedrooms and two baths, this property is your own private oasis. Imagine starting your mornings on the balcony sipping coffee while watching the wildlife, and ending your days by curling up to watch the sunset over the picturesque landscape. This property is larger than life and awaits you to turn your home dreams into reality.
## 6 Exquisite custom home nestled among mature trees on 2.5 acre serene setting. Corner lot for added privacy tucked away on secluded cul de sac with only 10 homes yet close to all conveniences. Quality craftsmanship with extensive wood moulding & nail down hardwood flooring. Family room has cozy fireplace flanked by built-ins with wall of windows allowing natural light & tranquil views. Kitchen offers quartzite counters, stainless appliances, built-in refrigerator, double ovens, 2 sinks & walk-in pantry. Primary bedroom down with patio access. Primary bath includes jetted tub, steam shower & walk-in closet with cedar closet. Separate 600 bottle cooled wine room! Upstairs has secondary bedrooms with Jack-n-Jill bath, HUGE game room & 4th living space! Tranquil backyard with flagstone covered patio surrounded by lush landscape leads to additional seating area with firepit overlooking expansive backyard & naturally wooded area. Oversized 3 car garage includes workshop! No HOA or city tax.
## type year_built street_address city state zip
## 1 single_family 2018 10410 Daw Collins Rd Cleveland TX 77328
## 2 single_family 2002 6800 Woodland Dr Athens TX 75752
## 3 single_family 2012 110 County Road 3456 Hawkins TX 75765
## 4 single_family 1985 1204 Vera Ln Kennedale TX 76060
## 5 single_family 1981 598 Stagecoach Trl Denison TX 75021
## 6 single_family 1999 5601 Joshua Ct Mansfield TX 76063
# Named numeric vector holding the number of missing values in each
# column of tx_data
nas_per_column <- vapply(
  tx_data,
  function(column) sum(is.na(column)),
  numeric(1)
)
print(nas_per_column)
## url status id listPrice baths
## 0 0 0 2 0
## baths_full baths_full_calc beds sqft stories
## 65 65 61 63 110
## sub_type text type year_built street_address
## 0 0 0 212 1
## city state zip
## 1 1 1
Findings: the listing with id 9318998668 has no street address, so its URL did not match the address pattern and its street_address, city, state, and zip entries are NA. We’ll discuss dropping this row.
# Interactive viewer call, left disabled
# View(tx_data)
# Re-inspect the structure now that the address columns have been added
glimpse(tx_data)
## Rows: 501
## Columns: 18
## $ url <chr> "https://www.realtor.com/realestateandhomes-detail/104…
## $ status <chr> "for_sale", "for_sale", "for_sale", "for_sale", "for_s…
## $ id <dbl> 9773941616, 9224923922, 9840661824, 7338317229, 728584…
## $ listPrice <int> 240000, 379900, 370000, 444000, 569000, 875000, 214500…
## $ baths <int> 2, 4, 2, 4, 2, 5, 2, 3, 2, 2, 2, 6, 2, 3, 3, 2, 0, 2, …
## $ baths_full <int> 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 5, 2, 2, 3, 2, NA, 2,…
## $ baths_full_calc <int> 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 5, 2, 2, 3, 2, NA, 2,…
## $ beds <int> 3, 4, 4, 5, 3, 4, 4, 5, 4, 3, 3, 6, 3, 3, 4, 4, NA, 3,…
## $ sqft <int> 1190, 2033, 2062, 3705, 3282, 4873, 2260, 2109, 1896, …
## $ stories <int> 1, 1, 1, 2, 2, 2, NA, 1, 1, 1, NA, 2, 1, 1, 2, 1, NA, …
## $ sub_type <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ text <chr> "Welcome home to your peaceful retreat nestled on 2 ac…
## $ type <chr> "single_family", "single_family", "single_family", "si…
## $ year_built <int> 2018, 2002, 2012, 1985, 1981, 1999, 2020, 1956, 2000, …
## $ street_address <chr> "10410 Daw Collins Rd", "6800 Woodland Dr", "110 Count…
## $ city <chr> "Cleveland", "Athens", "Hawkins", "Kennedale", "Deniso…
## $ state <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX", "TX", "TX", …
## $ zip <int> 77328, 75752, 75765, 76060, 75021, 76063, 75755, 76240…
# Listings that are neither land nor farm
tx_data_no_land_farm <- tx_data %>%
  filter(!(type == "land" | type == "farm"))
# Keep only the rows that still contain at least one NA
has_missing_value <- !complete.cases(tx_data_no_land_farm)
rows_with_na <- tx_data_no_land_farm[has_missing_value, ]
# view(rows_with_na)
Rows of type “land” and “farm” have NAs because no house is built on the property.
These values are supposed to be zero, since there is no house on the property.
# For land and farm listings, replace missing house attributes with 0.
# Values that are already present are left as they are.
house_only_cols <- c(
  "baths", "baths_full", "baths_full_calc",
  "beds", "stories", "sqft", "year_built"
)
is_land_or_farm <- tx_data$type %in% c("land", "farm")
for (col_name in house_only_cols) {
  fill_rows <- is_land_or_farm & is.na(tx_data[[col_name]])
  tx_data[[col_name]][fill_rows] <- 0
}
Since these records have too many NAs, it would be hard to predict the missing values, so they will be dropped. I also dropped a record with no location data.
# Remove the four listings singled out for deletion by their id
dropped_ids <- c(9031060769, 9530697722, 9634827807, 9318998668)
tx_data <- tx_data %>%
  filter(!(id %in% dropped_ids))
Trying to find clusters to see if KNN imputation will work. Based on the graph, you can see that houses with the same number of stories cluster together based on baths and sqft.
# Exploratory copy of the data: drop the list-price outlier and the sqft
# outlier (by id), and leave out land/farm rows, whose sqft, bath and
# stories values are known to be 0
temp <- tx_data %>%
  filter(!(id %in% c(9887837817, 9697989595))) %>%
  filter(!(type %in% c("land", "farm")))
# Distinct non-missing 'stories' values present in the filtered data
story_values <- na.omit(unique(temp$stories))
# One colour per distinct 'stories' value, interpolated from Set2
color_palette <- colorRampPalette(brewer.pal(5, "Set2"))(length(story_values))
# Tick positions: one per distinct bath count
bath_breaks <- unique(temp$baths)
# Beeswarm of sqft against bath count, coloured by number of stories;
# rows with a missing 'stories' value are drawn in blue
ggplot(temp, aes(x = baths, y = sqft, color = factor(stories))) +
  geom_beeswarm(cex = 1.2, show.legend = TRUE) +
  scale_x_continuous(breaks = bath_breaks, labels = bath_breaks) +
  scale_color_manual(
    values = c(color_palette),
    na.value = "blue",
    guide = guide_legend(title = "stories")
  ) +
  labs(
    title = "Sqft vs Baths",
    x = "# of Baths",
    y = "Sqft",
    color = '# of Floors'
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(hjust = 1, size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title = element_text(size = 12)
  )
## Warning: Removed 1 rows containing missing values (`geom_point()`).
# KNN imputation of the numeric house attributes.
# caret's "knnImpute" centres and scales the columns before imputing, so
# predict() returns standardised values that must be mapped back to the
# original units afterwards.
temp <- tx_data
temp <- subset(temp, select = c(baths, baths_full, baths_full_calc, beds, sqft, stories))
stories <- subset(temp, select = stories)
preProcValues <- preProcess(temp,
                            method = c("knnImpute"),
                            k = 5,
                            knnSummary = median)
impute_temp <- predict(preProcValues, temp, na.action = na.pass)
# Per-column centre and scale used by preProcess
procNames <- data.frame(col = names(preProcValues$mean), mean = preProcValues$mean, sd = preProcValues$std)
for (i in procNames$col) {
  # Undo the standardisation: x * sd + mean
  denormalized <- impute_temp[[i]] * preProcValues$std[[i]] + preProcValues$mean[[i]]
  # The (x - mean) / sd * sd + mean round trip is not exact in floating
  # point, so observed values would come back slightly perturbed and later
  # exact comparisons (e.g. sqft == 0, sqft != 67139) could silently fail.
  # Keep the original observation wherever one exists and use the
  # de-standardised value only where the input was missing.
  was_missing <- is.na(temp[[i]])
  impute_temp[[i]] <- ifelse(was_missing, denormalized, as.numeric(temp[[i]]))
}
# view(impute_temp)
# view(impute_temp)
Comparing the data before and after KNN imputation, it seems the imputation did not produce any extraordinary values. Most NAs in “stories” were replaced by the most frequent “stories” value in the clusters seen in the graph.
# Re-draw the Sqft vs Baths beeswarm on the imputed data.
# impute_temp has no id column, so the property type is copied across by
# row position and the sqft outlier is identified by its value.
impute_temp$type <- tx_data$type
temp <- impute_temp %>%
  # sqft has been standardised and de-standardised, so it may differ from
  # 67139 by floating-point noise; compare with a tolerance instead of !=
  filter(!dplyr::near(sqft, 67139)) %>% # Remove sqft outlier
  filter(!(type %in% c("land", "farm"))) # land/farm rows always have 0 sqft, baths and stories
# Create the beeswarm plot (color_palette comes from the pre-imputation plot)
ggplot(temp, aes(x = baths, y = sqft, color = factor(stories))) +
  geom_beeswarm(cex = 1.2, show.legend = TRUE) +
  scale_color_manual(values = c(color_palette),
                     na.value = "blue",
                     guide = guide_legend(title = "stories")) +
  theme_minimal() +
  scale_x_continuous(breaks = unique(temp$baths), labels = unique(temp$baths)) +
  labs(title = "Sqft vs Baths",
       x = "# of Baths",
       y = "Sqft",
       color = '# of Floors') +
  theme(legend.position = "bottom") +
  theme(axis.text.x = element_text(hjust = 1, size = 10),
        axis.text.y = element_text(size = 10),
        plot.title = element_text(size = 16, hjust = 0.5),
        axis.title = element_text(size = 12))
# Transfer imputed values to the original data frame.
# Only the cells that are actually missing are filled: copying the whole
# imputed column would also replace observed values with numbers that went
# through standardise/de-standardise and may carry floating-point noise,
# which breaks later exact comparisons such as sqft == 0.
stopifnot(nrow(impute_temp) == nrow(tx_data)) # rows are matched by position
missing_stories <- is.na(tx_data$stories)
tx_data$stories[missing_stories] <- impute_temp$stories[missing_stories]
missing_sqft <- is.na(tx_data$sqft) # there is only one NA for sqft
tx_data$sqft[missing_sqft] <- impute_temp$sqft[missing_sqft]
# year_built: a missing year is encoded as 1
tx_data$year_built[is.na(tx_data$year_built)] <- 1
Fill NA values in year_built with 1 to signify that the year is unknown.
# Collect every row of tx_data that still has at least one missing value
rows_with_na <- tx_data[!complete.cases(tx_data), ]
# View(rows_with_na)
# Where a listing has 0 sqft, try to recover an area from the free-text
# description: find a number followed by an acre unit and convert it to
# square feet (1 acre = 43,560 sqft).
# Regex to find acre values in the text column
# NOTE(review): the unit alternation is case-sensitive and "AC" has no word
# boundary, so text such as "2 AC units" would also match - confirm that
# this is acceptable for the data.
regex_acres <- "\\d+\\.?\\d*\\s*[-+]?/?\\s*(acre|acres|AC|Acre|acre)"
tx_data <- tx_data %>%
  mutate(
    # First acre mention in the description (NA when there is none)
    extract_acres = str_extract(text, regex_acres),
    # Keep only digits and the decimal point so the value can be parsed
    clean_acres = gsub("[^0-9.]", "", extract_acres),
    value_acres = as.numeric(clean_acres),
    # sqft may have been standardised and de-standardised during KNN
    # imputation, so a true zero can come back as a tiny non-zero number;
    # test for zero with a tolerance rather than with ==
    sqft = ifelse(dplyr::near(sqft, 0) & !is.na(value_acres),
                  value_acres * 43560,
                  sqft)) %>%
  select(-extract_acres, -clean_acres, -value_acres) # Removing temporary columns
# Writing updated tx_data to csv to share with team
# write.csv(tx_data, file = create_file, row.names = FALSE)
# Average list price per square foot of single-family homes, by city.
# sqft is shifted by one so that listings with 0 sqft do not divide by zero.
temp <- tx_data %>%
  filter(type == "single_family") %>%
  mutate(price_per_sqft = listPrice / (sqft + 1)) %>%
  group_by(city) %>%
  summarise(pricebysqft = mean(price_per_sqft))
# Horizontal bar chart of the ten most expensive cities; Austin is
# highlighted in red, every other city is grey
top_cities <- temp %>%
  top_n(10, pricebysqft) %>%
  mutate(city_color = ifelse(city == "Austin", "Austin", "Other"))
ggplot(top_cities) +
  geom_col(aes(x = pricebysqft,
               y = reorder(city, pricebysqft),
               fill = city_color)) +
  scale_fill_manual(values = c("Other" = "darkgrey", "Austin" = "red")) +
  guides(fill = "none") +
  labs(title = "Top 10 Most Expensive Cities For Single Family Homes by Sqft",
       x = "Average Price Per Sqft ($)",
       y = "City") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        plot.title = element_text(size = 16, hjust = 0.5),
        axis.title = element_text(size = 16))
#ggsave("Top10_Expensive_SFH.png", width = 10, height = 6, dpi = 300)
# Creating scatterplots of Square Footage vs. List Price by Property Type
# Listings with zero sqft are excluded because the y axis is logarithmic.
# sqft may carry floating-point noise from the imputation step, so "zero"
# is tested with a tolerance instead of an exact != 0.
filtered_sqft <- tx_data %>%
  filter(!dplyr::near(sqft, 0))
ggplot(filtered_sqft, aes(x = listPrice, y = sqft)) +
  geom_point(aes(color = type), alpha = 0.7) +
  facet_wrap(~ type, scales = "free") + # Creating a separate plot for each property type
  scale_y_log10() + # Applying log 10
  labs(title = "Square Footage vs. List Price by Property Type",
       x = "List Price",
       y = "Square Footage (Log Scale 10)") +
  theme_bw() + # Using bw theme
  theme(plot.title = element_text(hjust = 0.5), # Centering the plot titles
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "right",
        # linewidth replaces the deprecated size argument for border width
        legend.background = element_rect(fill = "white",
                                         color = "black", linewidth = 1, linetype = "solid"))
# ggsave("sqft_vs_price.png", width = 10, height = 6, dpi = 300)
#Potentially use for Milestone 3/4
# Bar chart of the number of listings per property type.
# Ordering data by count descending (type becomes a factor whose levels
# are sorted from most to least frequent)
tx_data$type <- factor(tx_data$type, levels = names(sort(table(tx_data$type), decreasing = TRUE)))
# Define appropriate labels for the x-axis
x_labels <- c("single_family" = "Single Family Homes", "land" = "Land", "farm" = "Farm", "mobile" = "Mobile Homes", "townhomes" = "Town Homes", "condos" = "Condos")
# Define colors for each property type
type_colors <- c("single_family" = "lightpink1", "land" = "tan4", "farm" = "olivedrab3", "mobile" = "cadetblue2", "townhomes" = "cornflowerblue", "condos" = "plum3")
# Bar chart with count by property type
ggplot(data = tx_data, aes(x = type, fill = type)) +
  geom_bar(stat = "count", color = "black") +
  # after_stat(count) replaces the deprecated ..count.. notation
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  labs(title = "Count of Properties by Type in Texas Real Estate Listings",
       x = "Property Type",
       y = "Count") +
  scale_fill_manual(values = type_colors) + # Setting custom colors
  scale_y_continuous(limits = c(0, max(table(tx_data$type)) * 1.1)) + # Extending y limit to prevent geom_text cutoff
  scale_x_discrete(labels = x_labels) + # Rename x labels
  theme_linedraw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 10),
        plot.title = element_text(size = 20, hjust = 0.5),
        axis.title = element_text(size = 16),
        panel.grid.major = element_blank(), # Removing gridlines
        panel.grid.minor = element_blank())
# ggsave("count_bar_plot.png", width = 10, height = 6, dpi = 300)
# Filtering for single_family type, this will be for multiple visualizations
single_family_homes <- tx_data[tx_data$type == "single_family",]
# Defining bin width at 200
binwidth <- 200
# A density integrates to 1, whereas the histogram bars sum to
# binwidth * number of rows; multiplying the density by that factor puts
# the KDE curve on the same count scale as the bars
scaling_factor <- binwidth * nrow(single_family_homes)
# Plotting the histogram with density line
ggplot(single_family_homes, aes(x = sqft)) +
  geom_histogram(binwidth = binwidth, color = "black", fill = "skyblue") +
  # Smoothing the density line using adjust = 2.0
  # (after_stat(density) replaces the deprecated ..density.. notation)
  geom_density(aes(y = after_stat(density) * scaling_factor), color = "red2", fill = "red2", alpha = 0.5, adjust = 2.0) +
  labs(title = "Distribution of Square Footage for Single Family Homes",
       x = "Square Footage",
       y = "Number of Listings") +
  scale_x_continuous(breaks = seq(0, 7000, by = 1000)) +
  scale_y_continuous(breaks = seq(0, 55, by = 5)) +
  theme_linedraw() +
  theme(axis.text.x = element_text(size = 13),
        axis.text.y = element_text(size = 13),
        plot.title = element_text(size = 20, hjust = 0.5),
        axis.title = element_text(size = 16),
        panel.grid.major = element_blank(), # Removing gridlines
        panel.grid.minor = element_blank())
# ggsave("sqft_dist.png", width = 10, height = 6, dpi = 300)
# Average square footage of the single-family listings
mean(single_family_homes$sqft)
## [1] 2226.75
# Defining bin width for the histogram
binwidth <- 50000
# Scaling factor that converts the KDE (area 1) to the histogram's count
# scale (area = binwidth * number of rows)
scaling_factor <- binwidth * nrow(single_family_homes)
# Plotting the histogram with density line for the listing prices of single-family homes
ggplot(single_family_homes, aes(x = listPrice)) +
  geom_histogram(binwidth = binwidth, color = "black", fill = "darkolivegreen3") +
  # after_stat(density) replaces the deprecated ..density.. notation
  geom_density(aes(y = after_stat(density) * scaling_factor), color = "red2", fill = "red2", alpha = 0.5, adjust = 2.0) +
  labs(title = "Distribution of Listing Price for Single Family Homes",
       x = "Listing Price ($)",
       y = "Number of Listings") +
  scale_x_continuous(labels = scales::comma,
                     breaks = seq(0, max(single_family_homes$listPrice), by = 250000)) + # More x-axis breaks
  theme_linedraw() +
  theme(axis.text.x = element_text(size = 13, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 13),
        plot.title = element_text(size = 20, hjust = 0.5),
        axis.title = element_text(size = 16),
        panel.grid.major = element_blank(), # Removing gridlines
        panel.grid.minor = element_blank())
# ggsave("price_dist.png", width = 10, height = 6, dpi = 300)
# Average list price of the single-family listings
mean(single_family_homes$listPrice)
## [1] 457608.3
# Scatter plot for Bathrooms vs. Bedrooms with jitter
# Both axes are small integer counts, so points are jittered by up to 0.3
# in each direction to reduce overplotting; colour encodes the bed count
ggplot(single_family_homes, aes(x = beds, y = baths)) +
  geom_jitter(aes(color = factor(beds)), width = 0.3, height = 0.3, alpha = 0.7) +
  labs(title = "Bathrooms vs. Bedrooms for Single Family Homes in Texas",
       x = "Count of Bedrooms",
       y = "Count of Bathrooms") +
  scale_color_brewer(palette = "Dark2") +
  # One tick per whole bedroom / bathroom
  scale_x_continuous(breaks = seq(0, max(single_family_homes$beds), by = 1)) +
  scale_y_continuous(breaks = seq(0, max(single_family_homes$baths), by = 1)) +
  theme_minimal() +
  theme(legend.position = "none",
        # linewidth replaces the deprecated size argument for border width
        legend.background = element_rect(fill = "white",
                                         color = "black", linewidth = 1, linetype = "solid"),
        axis.line = element_line(color = "black"))
# ggsave("bed_vs_bath_scatter_plot.png", width = 10, height = 6, dpi = 300)
# Bathrooms vs. Bedrooms again, this time as a heatmap.
# Count the single-family listings for every (beds, baths) combination;
# .groups = "drop" returns an ungrouped result.
heatmap_data <- summarise(
  group_by(single_family_homes, beds, baths),
  count = n(),
  .groups = "drop"
)
# Checking result
# View(heatmap_data)
# Fill gradient: the nine-step YlGnBu palette
heatmap_palette <- brewer.pal(9, "YlGnBu")
# One tile per (beds, baths) pair, shaded by the number of listings
ggplot(heatmap_data, aes(x = factor(beds), y = factor(baths), fill = count)) +
  geom_tile() +
  scale_fill_gradientn(colors = heatmap_palette) +
  labs(
    title = "Heatmap of Bathrooms vs. Bedrooms for Single Family Homes",
    x = "Bedrooms",
    y = "Bathrooms",
    fill = "Count of Single Family Homes"
  ) +
  theme_linedraw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 16),
    panel.grid.major = element_blank(), # no gridlines
    panel.grid.minor = element_blank()
  )
# ggsave("bed_vs_bath_heatmap.png", width = 10, height = 6, dpi = 300)
# Average number of bedrooms across the single-family listings
mean(single_family_homes$beds)
## [1] 3.53
# Average number of bathrooms across the single-family listings
mean(single_family_homes$baths)
## [1] 2.6875
# Filtering out data where year_built != 1, year_built != 0
# (1 is the fill value used for a missing year, 0 the fill value used for
# land/farm listings)
filtered_data_mosaic <- tx_data %>%
  filter(year_built != 1, year_built != 0)
# Categorizing 5 periods for year_built; cut() intervals are right-closed,
# so e.g. 1899 falls in the first bin and 1900 in the second
filtered_data_mosaic$year_built_period <- cut(filtered_data_mosaic$year_built,
                                              breaks = c(-Inf, 1899, 1945, 1999, 2010, Inf),
                                              labels = c("Before 1900 (Historic)", "1900-1945 (Early 20th Cent.)", "1946-1999 (Post-WWII-Late 20th Cent.)", "2000-2010 (Early 21st Cent.)", "2011-2024 (Modern)"))
# Getting all the colors from the "RdPu" palette
all_colors_RdPu <- brewer.pal(9, "RdPu")
# Skipping the first 3 colors of the palette
colors_to_use_RdPu <- all_colors_RdPu[-(1:3)]
# Mosaic Plot of Property Type vs. Year Built Period
ggplot(data = filtered_data_mosaic) +
  geom_mosaic(aes(weight = 1, x = product(type), fill = year_built_period), na.rm = TRUE) +
  labs(title = "Mosaic Plot - Property Type vs. Year Built Period", x = "Property Type", y = "Year Built Period") +
  scale_fill_manual(values = colors_to_use_RdPu) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.text.y = element_blank(),
        legend.position = "right",
        # linewidth replaces the deprecated size argument for border width
        legend.background = element_rect(fill = "white",
                                         color = "black", linewidth = 1, linetype = "solid")) +
  guides(fill = guide_legend(title = "Year Built Period"))
# ggsave("mosaic.png", width = 10, height = 6, dpi = 300)