Color resources: https://sape.inf.usi.ch/quick-reference/ggplot2/colour https://r-graph-gallery.com/38-rcolorbrewers-palettes.html https://ggplot2.tidyverse.org/reference/ggtheme.html https://r-graph-gallery.com/79-levelplot-with-ggplot2.html

Constants

# Path of the input CSV with the Texas real-estate listings (passed to read.csv).
dataset_file <- "real_estate_texas_500_2024.csv"
# Output path for the cleaned data; disabled together with the write.csv step.
#create_file <- "real_estate_texas_500_2024_ADDRESS.csv"

Libraries

library(tidyverse)
library(mosaic)
library(lubridate)
library(reshape)
library(reshape2)
library(gcookbook)
library(scales)
library(mapproj)
library(zoo)
library(gridExtra)
library(RColorBrewer)


library(datasets)
library(ggforce)
library(ggbeeswarm)
library(ggmosaic)

library(magrittr)
library(scales)
library(tidyquant)
library(readr)
library(ggpubr)
library(stringr)

library(tinytex)
library(rmarkdown)

library(fastDummies)
library(caret)
library(dplyr)
library(RANN)

Discovery

# Load the raw listings into tx_data and inspect column names, types and
# the first few values of each column.
tx_data <- read.csv(dataset_file)
glimpse(tx_data)
## Rows: 501
## Columns: 14
## $ url             <chr> "https://www.realtor.com/realestateandhomes-detail/104…
## $ status          <chr> "for_sale", "for_sale", "for_sale", "for_sale", "for_s…
## $ id              <dbl> 9773941616, 9224923922, 9840661824, 7338317229, 728584…
## $ listPrice       <int> 240000, 379900, 370000, 444000, 569000, 875000, 214500…
## $ baths           <int> 2, 4, 2, 4, 2, 5, 2, 3, 2, 2, 2, 6, 2, 3, 3, 2, 0, 2, …
## $ baths_full      <int> 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 5, 2, 2, 3, 2, NA, 2,…
## $ baths_full_calc <int> 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 5, 2, 2, 3, 2, NA, 2,…
## $ beds            <int> 3, 4, 4, 5, 3, 4, 4, 5, 4, 3, 3, 6, 3, 3, 4, 4, NA, 3,…
## $ sqft            <int> 1190, 2033, 2062, 3705, 3282, 4873, 2260, 2109, 1896, …
## $ stories         <int> 1, 1, 1, 2, 2, 2, NA, 1, 1, 1, NA, 2, 1, 1, 2, 1, NA, …
## $ sub_type        <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ text            <chr> "Welcome home to your peaceful retreat nestled on 2 ac…
## $ type            <chr> "single_family", "single_family", "single_family", "si…
## $ year_built      <int> 2018, 2002, 2012, 1985, 1981, 1999, 2020, 1956, 2000, …
# Checking count of all NAs in the tx_data

# Total number of NA cells across all columns of tx_data
sum(is.na(tx_data))
## [1] 578
# Per-column summary; numeric columns also report their own NA count
summary(tx_data)
##      url               status                id              listPrice       
##  Length:501         Length:501         Min.   :7.022e+09   Min.   :   10000  
##  Class :character   Class :character   1st Qu.:8.995e+09   1st Qu.:  264745  
##  Mode  :character   Mode  :character   Median :9.420e+09   Median :  374900  
##                                        Mean   :9.193e+09   Mean   :  510669  
##                                        3rd Qu.:9.798e+09   3rd Qu.:  539000  
##                                        Max.   :9.992e+09   Max.   :28950000  
##                                                            NA's   :2         
##      baths         baths_full    baths_full_calc      beds      
##  Min.   :0.000   Min.   :1.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:3.000  
##  Median :2.000   Median :2.000   Median :2.000   Median :3.000  
##  Mean   :2.323   Mean   :2.333   Mean   :2.333   Mean   :3.455  
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:4.000  
##  Max.   :8.000   Max.   :8.000   Max.   :8.000   Max.   :9.000  
##                  NA's   :65      NA's   :65      NA's   :61     
##       sqft          stories        sub_type             text          
##  Min.   :    0   Min.   :1.000   Length:501         Length:501        
##  1st Qu.: 1604   1st Qu.:1.000   Class :character   Class :character  
##  Median : 2034   Median :1.000   Mode  :character   Mode  :character  
##  Mean   : 2335   Mean   :1.376                                        
##  3rd Qu.: 2636   3rd Qu.:2.000                                        
##  Max.   :67139   Max.   :4.000                                        
##  NA's   :63      NA's   :110                                          
##      type             year_built  
##  Length:501         Min.   :1891  
##  Class :character   1st Qu.:1981  
##  Mode  :character   Median :2006  
##                     Mean   :2000  
##                     3rd Qu.:2022  
##                     Max.   :2024  
##                     NA's   :212

Ken Vellian: Creating Address columns through Regular Expressions

# Derive street_address, city, state and zip from the listing URL.

# One capture group per address part, in the order they appear in the URL:
# <street>_<city>_<state>_<5-digit zip>
url_groups <- "realestateandhomes-detail\\/([^_]+)_([^_]+)_([^_]+)_([0-9]{5})"

# Column 1 of the str_match() result is the full match; the capture groups
# start at column 2.
str_matches <- str_match(tx_data$url, url_groups)

# Map each new tx_data column to the str_match() column that holds it
address_parts <- c(street_address = 2, city = 3, state = 4, zip = 5)
for (part in names(address_parts)) {
  tx_data[[part]] <- str_matches[, address_parts[[part]]]
}

# Words inside the street address and the city are dash-delimited in the URL;
# turn every dash into a single space
for (part in c("street_address", "city")) {
  tx_data[[part]] <- gsub("-", " ", tx_data[[part]])
}

# ZIP is extracted as chr; store it as int
tx_data$zip <- as.integer(tx_data$zip)

# Removing the url column (not needed anymore) and status column (all statuses are for_sale, redundant)
# tx_data <- select(tx_data, -url, -status)

# Checking results
head(tx_data)
##                                                                                                      url
## 1 https://www.realtor.com/realestateandhomes-detail/10410-Daw-Collins-Rd_Cleveland_TX_77328_M97739-41616
## 2        https://www.realtor.com/realestateandhomes-detail/6800-Woodland-Dr_Athens_TX_75752_M92249-23922
## 3   https://www.realtor.com/realestateandhomes-detail/110-County-Road-3456_Hawkins_TX_75765_M98406-61824
## 4         https://www.realtor.com/realestateandhomes-detail/1204-Vera-Ln_Kennedale_TX_76060_M73383-17229
## 5     https://www.realtor.com/realestateandhomes-detail/598-Stagecoach-Trl_Denison_TX_75021_M72858-45528
## 6       https://www.realtor.com/realestateandhomes-detail/5601-Joshua-Ct_Mansfield_TX_76063_M75504-52644
##     status         id listPrice baths baths_full baths_full_calc beds sqft
## 1 for_sale 9773941616    240000     2          2               2    3 1190
## 2 for_sale 9224923922    379900     4          3               3    4 2033
## 3 for_sale 9840661824    370000     2          2               2    4 2062
## 4 for_sale 7338317229    444000     4          3               3    5 3705
## 5 for_sale 7285845528    569000     2          2               2    3 3282
## 6 for_sale 7550452644    875000     5          3               3    4 4873
##   stories sub_type
## 1       1         
## 2       1         
## 3       1         
## 4       2         
## 5       2         
## 6       2         
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       text
## 1                                                                                                        Welcome home to your peaceful retreat nestled on 2 acres of beautiful land! This charming home offers the perfect blend of comfort and serenity. Step inside to discover a cozy living space where relaxation meets functionality. The open layout seamlessly connects the living room, dining area, and kitchen, creating an inviting atmosphere for everyday living and entertaining. Outside, you'll find a spacious yard with plenty of room to roam and explore. Plus, the property includes a handy storage shed with electricity, and insulation perfect for storing tools, equipment, or creating your own workshop.Located in a peaceful setting, yet just a short drive from amenities and attractions, this property offers the best of both worlds rural tranquility and suburban convenience. Don't miss your chance to make this delightful home yours! Schedule a showing today and start living the good life. ??
## 2                                                                                                                                                                                                     Beautiful country home on 0.85 fenced acres, minutes from Athens' amenities. Sit in the swing on this large front porch, of this recently updated home, and enjoy the peace & quietness of the neighborhood. This beautiful 4 bed, 3.5 bath home has an open dining/kitchen area with granite counters & bar seating. The WBFP is the center of attention in the spacious living room. New Aerobic system 2022, 2 updated bathrooms as well new carpet. The 4th bedroom has its own ensuite & a sitting area. This could be utilized as an office. The 2-car garage & 4 car carport have all your vehicles covered, what more could you want! A workshop with plenty of storage and a 1st class chicken coup in the spacious fenced back yard is perfect. Buyer to obtain own survey. Call the Listing Agent for your personal tour.
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  PRICED TO SELL CORNER LOT HAS A STORM SHELTER IN GARAGE
## 4 Come check out country living in the city! Are you looking for a large family home? This is the one for you! This 5 BR, 3.5 bath, 2 large living areas, office, flex room currently being used as a 6th bedroom boasts 3705sqf on just under half an acre on a secluded cul-de-sac in Kennedale ISD. This street is one of the most coveted streets with the best neighbors in Kennedale. Features include: 2 huge living areas downstairs plus an office & oversized primary BR suite. Updates include: fresh paint on outside of home & in garage, replaced water heaters, laminate flooring in primary BR, updated shower, updated second full bath. Kitchen has that farmhouse charm, granite counter tops, & windows that overlook backyard acreage plus greenbelt behind it for all the country feel. Space for chickens & goats in fenced pens, a shed, garden, and a dog run. If you are looking for a large home with plenty of space for your family and friends to host all year long, this is the one for you. Welcome Home!
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                             Welcome to your dream retreat! Nestled on over 7 acres of secluded land adorned with towering mature oaks, this massively well-renovated home offers a truly unique living experience. With over 3200 square feet of living space, including three bedrooms and two baths, this property is your own private oasis. Imagine starting your mornings on the balcony sipping coffee while watching the wildlife, and ending your days by curling up to watch the sunset over the picturesque landscape. This property is larger than life and awaits you to turn your home dreams into reality.
## 6    Exquisite custom home nestled among mature trees on 2.5 acre serene setting. Corner lot for added privacy tucked away on secluded cul de sac with only 10 homes yet close to all conveniences. Quality craftsmanship with extensive wood moulding & nail down hardwood flooring. Family room has cozy fireplace flanked by built-ins with wall of windows allowing natural light & tranquil views. Kitchen offers quartzite counters, stainless appliances, built-in refrigerator, double ovens, 2 sinks & walk-in pantry. Primary bedroom down with patio access. Primary bath includes jetted tub, steam shower & walk-in closet with cedar closet. Separate 600 bottle cooled wine room! Upstairs has secondary bedrooms with Jack-n-Jill bath, HUGE game room & 4th living space! Tranquil backyard with flagstone covered patio surrounded by lush landscape leads to additional seating area with firepit overlooking expansive backyard & naturally wooded area. Oversized 3 car garage includes workshop! No HOA or city tax.
##            type year_built       street_address      city state   zip
## 1 single_family       2018 10410 Daw Collins Rd Cleveland    TX 77328
## 2 single_family       2002     6800 Woodland Dr    Athens    TX 75752
## 3 single_family       2012 110 County Road 3456   Hawkins    TX 75765
## 4 single_family       1985         1204 Vera Ln Kennedale    TX 76060
## 5 single_family       1981   598 Stagecoach Trl   Denison    TX 75021
## 6 single_family       1999       5601 Joshua Ct Mansfield    TX 76063
# Named numeric vector with the number of NAs in each column of tx_data.
nas_per_column <- vapply(
  tx_data,
  function(column) sum(is.na(column)),
  numeric(1)
)

print(nas_per_column)
##             url          status              id       listPrice           baths 
##               0               0               0               2               0 
##      baths_full baths_full_calc            beds            sqft         stories 
##              65              65              61              63             110 
##        sub_type            text            type      year_built  street_address 
##               0               0               0             212               1 
##            city           state             zip 
##               1               1               1

Findings: id: 9318998668 does not have a street address. Therefore, its entries are NA. We’ll discuss dropping this row.

# View(tx_data)
# Re-inspect the structure of tx_data now that street_address, city, state
# and zip have been added.
glimpse(tx_data)
## Rows: 501
## Columns: 18
## $ url             <chr> "https://www.realtor.com/realestateandhomes-detail/104…
## $ status          <chr> "for_sale", "for_sale", "for_sale", "for_sale", "for_s…
## $ id              <dbl> 9773941616, 9224923922, 9840661824, 7338317229, 728584…
## $ listPrice       <int> 240000, 379900, 370000, 444000, 569000, 875000, 214500…
## $ baths           <int> 2, 4, 2, 4, 2, 5, 2, 3, 2, 2, 2, 6, 2, 3, 3, 2, 0, 2, …
## $ baths_full      <int> 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 5, 2, 2, 3, 2, NA, 2,…
## $ baths_full_calc <int> 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 5, 2, 2, 3, 2, NA, 2,…
## $ beds            <int> 3, 4, 4, 5, 3, 4, 4, 5, 4, 3, 3, 6, 3, 3, 4, 4, NA, 3,…
## $ sqft            <int> 1190, 2033, 2062, 3705, 3282, 4873, 2260, 2109, 1896, …
## $ stories         <int> 1, 1, 1, 2, 2, 2, NA, 1, 1, 1, NA, 2, 1, 1, 2, 1, NA, …
## $ sub_type        <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ text            <chr> "Welcome home to your peaceful retreat nestled on 2 ac…
## $ type            <chr> "single_family", "single_family", "single_family", "si…
## $ year_built      <int> 2018, 2002, 2012, 1985, 1981, 1999, 2020, 1956, 2000, …
## $ street_address  <chr> "10410 Daw Collins Rd", "6800 Woodland Dr", "110 Count…
## $ city            <chr> "Cleveland", "Athens", "Hawkins", "Kennedale", "Deniso…
## $ state           <chr> "TX", "TX", "TX", "TX", "TX", "TX", "TX", "TX", "TX", …
## $ zip             <int> 77328, 75752, 75765, 76060, 75021, 76063, 75755, 76240…

Francisco Lozano: Filling NAs

Discovery

# Listings that are neither land nor farm (rows with an NA type are dropped,
# as with the `!=` comparisons)
tx_data_no_land_farm <- filter(
  tx_data,
  !is.na(type),
  !(type %in% c("land", "farm"))
)

# Keep only the rows that contain at least one NA
rows_with_na <- tx_data_no_land_farm[!complete.cases(tx_data_no_land_farm), ]
# view(rows_with_na)

Rows that are type “land” and “farm” have NAs since no house is built

Francisco Lozano: Filling NA for Rows with type land or farm

These values are supposed to be zero because there is no house on the property.

# For land and farm listings, replace missing house attributes with 0.
# The same rule is applied to every column listed below.
house_columns <- c(
  "baths", "baths_full", "baths_full_calc", "beds",
  "stories", "sqft", "year_built"
)
is_land_or_farm <- tx_data$type %in% c("land", "farm")

for (house_column in house_columns) {
  fill_rows <- is_land_or_farm & is.na(tx_data[[house_column]])
  tx_data[[house_column]][fill_rows] <- 0
}

Francisco Lozano: Dropping Records

Since these records have too many NAs, it would be hard to predict the missing values, so they will be dropped. I also dropped a record with no location data.

# ids of the records to drop from tx_data
ids_to_drop <- c(9031060769, 9530697722, 9634827807, 9318998668)

# `!is.na(id)` keeps the behaviour of chained `id != ...` filters, which also
# discard rows whose id is NA.
tx_data <- filter(tx_data, !is.na(id), !(id %in% ids_to_drop))

Francisco Lozano: Filling “stories” & “sqft” with KNN Imputation

Francisco Lozano: Finding Clusters

Trying to find clusters to see whether KNN imputation will work. The graph shows that houses with the same number of stories cluster together by number of baths and sqft.

# Working copy of tx_data for the cluster plot. Excluded rows:
#   - id 9887837817: listPrice outlier
#   - id 9697989595: sqft outlier
#   - land and farm listings (their sqft, baths and stories were set to 0)
temp <- filter(
  tx_data,
  id != 9887837817,
  id != 9697989595,
  !(type %in% c("land", "farm"))
)

# Distinct non-NA 'stories' values; one colour is generated per value
story_values <- na.omit(unique(temp$stories))
color_palette <- colorRampPalette(brewer.pal(5, "Set2"))(length(story_values))

# One x-axis tick per distinct number of baths
bath_breaks <- unique(temp$baths)

# Beeswarm plot of sqft by number of baths, coloured by stories
# (rows with NA stories are drawn in blue)
ggplot(temp, aes(x = baths, y = sqft, color = factor(stories))) +
  geom_beeswarm(cex = 1.2, show.legend = TRUE) +
  scale_color_manual(
    values = color_palette,
    na.value = "blue",
    guide = guide_legend(title = "stories")
  ) +
  scale_x_continuous(breaks = bath_breaks, labels = bath_breaks) +
  labs(
    title = "Sqft vs Baths",
    x = "# of Baths",
    y = "Sqft",
    color = '# of Floors'
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(hjust = 1, size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title = element_text(size = 12)
  )
## Warning: Removed 1 rows containing missing values (`geom_point()`).

Francisco Lozano: Performing KNN imputation

# Columns handed to the KNN imputation
knn_columns <- c("baths", "baths_full", "baths_full_calc", "beds", "sqft", "stories")
temp <- tx_data[, knn_columns]
stories <- temp["stories"]

# Fit the imputation model: 5 nearest neighbours, combined with the median
preProcValues <- preProcess(
  temp,
  method = c("knnImpute"),
  k = 5,
  knnSummary = median
)
impute_temp <- predict(preProcValues, temp, na.action = na.pass)

# The predicted values come back standardised; map every column back to its
# original scale with the mean and std stored in the fitted model.
procNames <- data.frame(
  col = names(preProcValues$mean),
  mean = preProcValues$mean,
  sd = preProcValues$std
)
for (col_name in procNames$col) {
  col_sd <- preProcValues$std[[col_name]]
  col_mean <- preProcValues$mean[[col_name]]
  impute_temp[[col_name]] <- impute_temp[[col_name]] * col_sd + col_mean
}

# view(impute_temp)

# view(impute_temp)

Francisco Lozano: Looking at the Clusters again

Comparing the plots before and after KNN imputation, it seems that the imputation did not produce any extraordinary values. Most NAs in “stories” were replaced by the most frequent “stories” value of the clusters seen in the graph.

# Re-attach the property type so land/farm rows can be filtered out again
impute_temp$type <- tx_data$type

# sqft in impute_temp has been standardised and scaled back, so it is a
# floating-point value that may differ from the raw 67139 by rounding error.
# Compare with a tolerance instead of `!=` so the outlier is reliably removed.
temp <- impute_temp %>%
  filter(!dplyr::near(sqft, 67139)) %>%  # Remove sqft outlier
  filter(!(type %in% c("land", "farm"))) # filter for same reason as before

# One x-axis tick per distinct number of baths
bath_breaks <- unique(temp$baths)

# Beeswarm plot of sqft by number of baths, coloured by (imputed) stories.
# Reuses color_palette from the pre-imputation plot.
# NOTE(review): guide_legend(title = "stories") appears to take precedence
# over labs(color = ...) for the legend title - confirm which is intended.
ggplot(temp, aes(x = baths, y = sqft, color = factor(stories))) +
  geom_beeswarm(cex = 1.2, show.legend = TRUE) +
  scale_color_manual(
    values = c(color_palette),
    na.value = "blue",
    guide = guide_legend(title = "stories")
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = bath_breaks, labels = bath_breaks) +
  labs(
    title = "Sqft vs Baths",
    x = "# of Baths",
    y = "Sqft",
    color = '# of Floors'
  ) +
  theme(legend.position = "bottom") +
  theme(
    axis.text.x = element_text(hjust = 1, size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title = element_text(size = 12)
  )

Francisco Lozano: Transfer the values

# Transfer imputed values to the original df.
# Only cells that were NA are filled in: impute_temp was standardised and then
# scaled back, so copying whole columns would replace observed values with
# their floating-point round-trip (e.g. an exact 0 may no longer equal 0),
# which breaks later exact comparisons on sqft.
# stories and sqft are whole-number columns in the raw data, so the imputed
# values are rounded to drop the same floating-point noise.
for (imputed_col in c("stories", "sqft")) { # there is only one NA for sqft
  was_missing <- is.na(tx_data[[imputed_col]])
  tx_data[[imputed_col]][was_missing] <-
    round(impute_temp[[imputed_col]][was_missing])
}

Francisco Lozano: Filling “year_built”

# year_built: replace every remaining NA with the placeholder value 1
tx_data$year_built <- replace(
  tx_data$year_built,
  is.na(tx_data$year_built),
  1
)

Fill NA with 1, to signify unknown

Francisco Lozano: Checking for NA again

# Collect the rows of tx_data that still contain at least one NA
rows_with_na <- tx_data[!complete.cases(tx_data), ]
# View(rows_with_na)

Ken Vellian: Fill in missing Sqft Values through Regular Expressions

# Regex to find acre values in the text column: a number (optionally with a
# decimal part), optional "+", "-" or "/" separators, then an acre unit.
# The trailing \\b requires the unit to be a whole word, so "AC" no longer
# matches as the prefix of an unrelated word (e.g. "2 ACCESS").
# NOTE(review): a bare "AC" can still refer to air conditioning ("2 AC units").
regex_acres <- "\\d+\\.?\\d*\\s*[-+]?/?\\s*(acres?|Acres?|ACRES?|AC)\\b"

tx_data <- tx_data %>%
  mutate(
    # Extracting the acres phrase from the text column
    extract_acres = str_extract(text, regex_acres),
    # Keeping only digits and the decimal point so the value can be parsed
    clean_acres = gsub("[^0-9.]", "", extract_acres),
    value_acres = as.numeric(clean_acres),
    # Converting acres to sqft (1 acre = 43560 sqft) when sqft is 0 and an acre
    # value was found. sqft may have gone through the imputation's
    # scale/unscale step, so 0 is tested with a tolerance rather than `==`.
    sqft = ifelse(
      dplyr::near(sqft, 0) & !is.na(value_acres),
      value_acres * 43560,
      sqft
    )
  ) %>%
  select(-extract_acres, -clean_acres, -value_acres) # Removing temporary columns

Output File

# Writing updated tx_data to csv to share with team
# write.csv(tx_data, file = create_file, row.names = FALSE)

Francisco Lozano’s graphs

Francisco Lozano: Top 10 Most Expensive Cities for SFH

# Average list price per sqft of single-family homes, by city.
# Listings without a usable price or living area are excluded instead of
# shifting sqft by one: the shift biased every ratio and turned a 0-sqft
# listing into "price / 1", which could dominate a city's average.
temp <- tx_data %>%
  filter(
    type == "single_family",
    !is.na(listPrice),
    !is.na(sqft),
    sqft >= 1
  ) %>%
  group_by(city) %>%
  summarise(pricebysqft = mean(listPrice / sqft))

# Horizontal bar chart of the 10 cities with the highest average price per
# sqft (ties are kept); Austin is highlighted in red.
temp %>%
  slice_max(pricebysqft, n = 10) %>%
  mutate(city_color = ifelse(city == "Austin", "Austin", "Other")) %>%
  ggplot() +
  geom_col(aes(
    x = pricebysqft,
    y = reorder(city, pricebysqft),
    fill = city_color
  )) +
  scale_fill_manual(values = c("Other" = "darkgrey", "Austin" = "red")) +
  labs(
    title = "Top 10 Most Expensive Cities For Single Family Homes by Sqft",
    x = "Average Price Per Sqft ($)",
    y = "City"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title = element_text(size = 16)
  ) +
  guides(fill = "none")

#ggsave("Top10_Expensive_SFH.png", width = 10, height = 6, dpi = 300)

Ken Vellian’s graphs

Ken Vellian: 1. Milestone 2: Panel Scatterplot for initial exploratory visualization

# Creating scatterplots of Square Footage vs. List Price by Property Type

# Dropping listings whose sqft is 0 (they cannot be shown on a log scale).
# sqft may carry floating-point noise from the imputation step, so 0 is tested
# with a tolerance rather than `!=`.
filtered_sqft <- tx_data %>%
  filter(!dplyr::near(sqft, 0))

ggplot(filtered_sqft, aes(x = listPrice, y = sqft)) +
  geom_point(aes(color = type), alpha = 0.7) +
  facet_wrap(~ type, scales = "free") + # Creating a separate plot for each property type
  scale_y_log10() + # Applying log 10
  labs(
    title = "Square Footage vs. List Price by Property Type",
    x = "List Price",
    y = "Square Footage (Log Scale 10)"
  ) +
  theme_bw() + # Using bw theme
  theme(
    plot.title = element_text(hjust = 0.5), # Centering the plot titles
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right",
    # `linewidth` replaces the deprecated `size` argument of element_rect()
    legend.background = element_rect(
      fill = "white", color = "black", linewidth = 1, linetype = "solid"
    )
  )

# ggsave("sqft_vs_price.png", width = 10, height = 6, dpi = 300)

Ken Vellian: 2. Milestone 3,4: Revised Bar plot

#Potentially use for Milestone 3/4

# Ordering the property types by listing count, descending, so the bars are
# drawn from most to least common
type_counts <- sort(table(tx_data$type), decreasing = TRUE)
tx_data$type <- factor(tx_data$type, levels = names(type_counts))

# Define appropriate labels for the x-axis
x_labels <- c(
  "single_family" = "Single Family Homes",
  "land" = "Land",
  "farm" = "Farm",
  "mobile" = "Mobile Homes",
  "townhomes" = "Town Homes",
  "condos" = "Condos"
)

# Define colors for each property type
type_colors <- c(
  "single_family" = "lightpink1",
  "land" = "tan4",
  "farm" = "olivedrab3",
  "mobile" = "cadetblue2",
  "townhomes" = "cornflowerblue",
  "condos" = "plum3"
)

# Bar chart with count by property type
ggplot(data = tx_data, aes(x = type, fill = type)) +
  geom_bar(stat = "count", color = "black") +
  # after_stat(count) replaces the deprecated `..count..` notation
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  labs(
    title = "Count of Properties by Type in Texas Real Estate Listings",
    x = "Property Type",
    y = "Count"
  ) +
  scale_fill_manual(values = type_colors) + # Setting custom colors
  # Extending y limit by 10% to prevent geom_text cutoff
  scale_y_continuous(limits = c(0, max(table(tx_data$type)) * 1.1)) +
  scale_x_discrete(labels = x_labels) + # Rename x labels
  theme_linedraw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 16),
    panel.grid.major = element_blank(), # Removing gridlines
    panel.grid.minor = element_blank()
  )

# ggsave("count_bar_plot.png", width = 10, height = 6, dpi = 300)

Ken Vellian: 3. Milestone 3,4: Revised histogram/KDE curve used

# Filtering for single_family type, this will be for multiple visualizations
single_family_homes <- tx_data[tx_data$type == "single_family", ]
# Defining bin width at 200
binwidth <- 200

# Scaling factor that puts the KDE curve on the histogram's count scale:
# density * bin width * number of listings
scaling_factor <- binwidth * nrow(single_family_homes)

# Plotting the histogram with density line
ggplot(single_family_homes, aes(x = sqft)) +
  geom_histogram(binwidth = binwidth, color = "black", fill = "skyblue") +
  # Smoothing the density line using adjust = 2.0.
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_density(
    aes(y = after_stat(density * scaling_factor)),
    color = "red2", fill = "red2", alpha = 0.5, adjust = 2.0
  ) +
  labs(
    title = "Distribution of Square Footage for Single Family Homes",
    x = "Square Footage",
    y = "Number of Listings"
  ) +
  scale_x_continuous(breaks = seq(0, 7000, by = 1000)) +
  scale_y_continuous(breaks = seq(0, 55, by = 5)) +
  theme_linedraw() +
  theme(
    axis.text.x = element_text(size = 13),
    axis.text.y = element_text(size = 13),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 16),
    panel.grid.major = element_blank(), # Removing gridlines
    panel.grid.minor = element_blank()
  )

# ggsave("sqft_dist.png", width = 10, height = 6, dpi = 300)
mean(single_family_homes$sqft)
## [1] 2226.75

Ken Vellian: 4. Milestone 3,4: Revised histogram/KDE curve

# Defining bin width for the histogram
binwidth <- 50000

# Scaling factor that puts the KDE curve on the histogram's count scale:
# density * bin width * number of listings
scaling_factor <- binwidth * nrow(single_family_homes)

# Upper end of the x-axis breaks. na.rm = TRUE keeps seq() from failing if a
# listing has no price.
max_list_price <- max(single_family_homes$listPrice, na.rm = TRUE)

# Plotting the histogram with density line for the listing prices of single-family homes
ggplot(single_family_homes, aes(x = listPrice)) +
  geom_histogram(binwidth = binwidth, color = "black", fill = "darkolivegreen3") +
  # after_stat(density) replaces the deprecated `..density..` notation
  geom_density(
    aes(y = after_stat(density * scaling_factor)),
    color = "red2", fill = "red2", alpha = 0.5, adjust = 2.0
  ) +
  labs(
    title = "Distribution of Listing Price for Single Family Homes",
    x = "Listing Price ($)",
    y = "Number of Listings"
  ) +
  scale_x_continuous(
    labels = scales::comma,
    breaks = seq(0, max_list_price, by = 250000) # More x-axis breaks
  ) +
  theme_linedraw() +
  theme(
    axis.text.x = element_text(size = 13, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 13),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 16),
    panel.grid.major = element_blank(), # Removing gridlines
    panel.grid.minor = element_blank()
  )

# ggsave("price_dist.png", width = 10, height = 6, dpi = 300)
mean(single_family_homes$listPrice)
## [1] 457608.3

Ken Vellian: 5. Milestone 3,4: Scatterplot of Baths vs Beds with jitter. Revised into heatmap for final report.

# Scatter plot for Bathrooms vs. Bedrooms with jitter

# Largest observed counts, used for the integer axis breaks. na.rm = TRUE
# keeps seq() from failing if a listing has a missing bed or bath count.
max_beds <- max(single_family_homes$beds, na.rm = TRUE)
max_baths <- max(single_family_homes$baths, na.rm = TRUE)

ggplot(single_family_homes, aes(x = beds, y = baths)) +
  # Jitter spreads out the points that share the same (beds, baths) pair
  geom_jitter(aes(color = factor(beds)), width = 0.3, height = 0.3, alpha = 0.7) +
  labs(
    title = "Bathrooms vs. Bedrooms for Single Family Homes in Texas",
    x = "Count of Bedrooms",
    y = "Count of Bathrooms"
  ) +
  scale_color_brewer(palette = "Dark2") +
  scale_x_continuous(breaks = seq(0, max_beds, by = 1)) +
  scale_y_continuous(breaks = seq(0, max_baths, by = 1)) +
  theme_minimal() +
  theme(
    legend.position = "none",
    # `linewidth` replaces the deprecated `size` argument of element_rect()
    legend.background = element_rect(
      fill = "white", color = "black", linewidth = 1, linetype = "solid"
    ),
    axis.line = element_line(color = "black")
  )

# ggsave("bed_vs_bath_scatter_plot.png", width = 10, height = 6, dpi = 300)

Ken Vellian: 6. Final report: Heatmap of Baths vs Beds. Used this as a different approach to visualize

# Revisiting the Bathrooms vs. Bedrooms visualization with a heatmap

# Count the single-family listings for every (beds, baths) combination.
# .groups = "drop" returns an ungrouped result.
beds_baths_groups <- group_by(single_family_homes, beds, baths)
heatmap_data <- summarise(beds_baths_groups, count = n(), .groups = "drop")

# Checking result
# View(heatmap_data)

# Heatmap of the summarized data: one tile per (beds, baths) pair, filled by
# the number of listings
heatmap_palette <- brewer.pal(9, "YlGnBu") # Using YlGnBu palette

ggplot(heatmap_data, aes(x = factor(beds), y = factor(baths), fill = count)) +
  geom_tile() +
  scale_fill_gradientn(colors = heatmap_palette) +
  labs(
    title = "Heatmap of Bathrooms vs. Bedrooms for Single Family Homes",
    x = "Bedrooms",
    y = "Bathrooms",
    fill = "Count of Single Family Homes"
  ) +
  theme_linedraw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 16),
    panel.grid.major = element_blank(), # Removing gridlines
    panel.grid.minor = element_blank()
  )

# ggsave("bed_vs_bath_heatmap.png", width = 10, height = 6, dpi = 300)
mean(single_family_homes$beds)
## [1] 3.53
mean(single_family_homes$baths)
## [1] 2.6875

Ken Vellian: Experimenting with Mosaic Plots

  1. Create a visualization using one of the techniques from the latter half of the class (after the midterm).
# Dropping rows whose year_built is one of the placeholder values
# (1 = unknown, 0 = filled in for land/farm listings)
filtered_data_mosaic <- tx_data %>%
  filter(year_built != 1, year_built != 0)

# Categorizing 5 periods for year_built. cut() intervals are right-closed by
# default, so e.g. 1899 falls in the first period and 1900 in the second.
filtered_data_mosaic$year_built_period <- cut(
  filtered_data_mosaic$year_built,
  breaks = c(-Inf, 1899, 1945, 1999, 2010, Inf),
  labels = c(
    "Before 1900 (Historic)",
    "1900-1945 (Early 20th Cent.)",
    "1946-1999 (Post-WWII-Late 20th Cent.)",
    "2000-2010 (Early 21st Cent.)",
    "2011-2024 (Modern)"
  )
)
# Getting all the colors from the "RdPu" palette
all_colors_RdPu <- brewer.pal(9, "RdPu")

# Skipping the first 3 colors of the palette
colors_to_use_RdPu <- all_colors_RdPu[-(1:3)]

# Mosaic Plot of Property Type vs. Year Built Period
ggplot(data = filtered_data_mosaic) +
  geom_mosaic(
    aes(weight = 1, x = product(type), fill = year_built_period),
    na.rm = TRUE
  ) +
  labs(
    title = "Mosaic Plot - Property Type vs. Year Built Period",
    x = "Property Type",
    y = "Year Built Period"
  ) +
  scale_fill_manual(values = colors_to_use_RdPu) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_blank(),
    legend.position = "right",
    # `linewidth` replaces the deprecated `size` argument of element_rect()
    legend.background = element_rect(
      fill = "white", color = "black", linewidth = 1, linetype = "solid"
    )
  ) +
  guides(fill = guide_legend(title = "Year Built Period"))

# ggsave("mosaic.png", width = 10, height = 6, dpi = 300)