In [1]:
# Numerical Operations and Data Manipulation
import pandas as pd
import numpy as np
import numpy.linalg as la
from collections import Counter
import math
import random
# Linear Algebra and Matrix Operations
from scipy.sparse import csr_matrix
from scipy.linalg import svd
from scipy.sparse.linalg import svds
# Data Preprocessing and Machine Learning
from sklearn import datasets
from sklearn import metrics
from sklearn.metrics import silhouette_samples, confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, completeness_score, homogeneity_score
from sklearn.feature_extraction import DictVectorizer, text
from sklearn import preprocessing, model_selection, decomposition, feature_selection
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
# Plotting and Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
import graphviz
from IPython.display import Image
# Time-Related Functions
import time
# Import statements for PDF files
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
# Import statements for Excel files
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows
# Miscellaneous
from operator import itemgetter, attrgetter
import importlib
import re
1. Loading and Cleaning dataset¶
In [2]:
# Load the CSV file
file_path = '/Users/kvellian/My Drive/Bizi Projects/Tobacco_Inv_8.28.24_9.11.24.csv'
df = pd.read_csv(file_path)
In [3]:
df.head()
Out[3]:
| BIZI MART | Vendor Detail Report\nDate: 08/28/2024 to 09/11/2024 | Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | Order | ... | Items Count: | 107 | 047995855079 | American Spirit Black | $0.00 | $12.87 | 0.00 | 1 | 7 | 1.1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BIZI MART | Vendor Detail Report\nDate: 08/28/2024 to 09/1... | Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | Order | ... | Items Count: | 107 | 4.799586e+10 | American Spirit Blue | $10.93 | $12.87 | 15.09 | 24.0 | 18.0 | 1 |
| 1 | BIZI MART | Vendor Detail Report\nDate: 08/28/2024 to 09/1... | Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | Order | ... | Items Count: | 107 | 4.799586e+10 | American Spirit Gold | $11.07 | $13.46 | 17.76 | 1.0 | 14.0 | 1 |
| 2 | BIZI MART | Vendor Detail Report\nDate: 08/28/2024 to 09/1... | Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | Order | ... | Items Count: | 107 | 7.161020e+10 | BACKWOOD Smooth (1) | $0.00 | $1.69 | 0.00 | 2.0 | 16.0 | 1 |
| 3 | BIZI MART | Vendor Detail Report\nDate: 08/28/2024 to 09/1... | Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | Order | ... | Items Count: | 107 | 7.161020e+10 | BACKWOOD Smooth (5 pk) | $0.00 | $7.99 | 0.00 | 2.0 | 65.0 | 1 |
| 4 | BIZI MART | Vendor Detail Report\nDate: 08/28/2024 to 09/1... | Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | Order | ... | Items Count: | 107 | 7.161030e+10 | BACKWOOD SWEET AROMATIC (1) | $0.00 | $1.59 | 0.00 | 20.0 | 8.0 | 1 |
5 rows × 23 columns
In [4]:
df.info
Out[4]:
<bound method DataFrame.info of BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/11/2024 Item # \
0 BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/1... Item #
1 BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/1... Item #
2 BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/1... Item #
3 BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/1... Item #
4 BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/1... Item #
.. ... ... ...
101 BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/1... Item #
102 BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/1... Item #
103 BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/1... Item #
104 BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/1... Item #
105 BIZI MART Vendor Detail Report\nDate: 08/28/2024 to 09/1... Item #
Item Name Cost Price Profit % Sold On Hand Order ... \
0 Item Name Cost Price Profit % Sold On Hand Order ...
1 Item Name Cost Price Profit % Sold On Hand Order ...
2 Item Name Cost Price Profit % Sold On Hand Order ...
3 Item Name Cost Price Profit % Sold On Hand Order ...
4 Item Name Cost Price Profit % Sold On Hand Order ...
.. ... ... ... ... ... ... ... ...
101 Item Name Cost Price Profit % Sold On Hand Order ...
102 Item Name Cost Price Profit % Sold On Hand Order ...
103 Item Name Cost Price Profit % Sold On Hand Order ...
104 Item Name Cost Price Profit % Sold On Hand Order ...
105 Item Name Cost Price Profit % Sold On Hand Order ...
Items Count: 107 047995855079 American Spirit Black \
0 Items Count: 107 4.799586e+10 American Spirit Blue
1 Items Count: 107 4.799586e+10 American Spirit Gold
2 Items Count: 107 7.161020e+10 BACKWOOD Smooth (1)
3 Items Count: 107 7.161020e+10 BACKWOOD Smooth (5 pk)
4 Items Count: 107 7.161030e+10 BACKWOOD SWEET AROMATIC (1)
.. ... ... ... ...
101 Items Count: 107 8.667410e+05 ZIG ZAG KING
102 Items Count: 107 8.661490e+05 ZIG ZAG WHITE (32 pk)
103 Items Count: 107 7.847621e+11 Zig-Zag Wraps Straight-Up (2pk)
104 Items Count: 107 7.847621e+11 Zig-Zag Wraps Straight-Up (4 pk)
105 Items Count: 107 6.092499e+11 ZYN Chill 6mg
$0.00 $12.87 0.00 1 7 1.1
0 $10.93 $12.87 15.09 24.0 18.0 1
1 $11.07 $13.46 17.76 1.0 14.0 1
2 $0.00 $1.69 0.00 2.0 16.0 1
3 $0.00 $7.99 0.00 2.0 65.0 1
4 $0.00 $1.59 0.00 20.0 8.0 1
.. ... ... ... ... ... ...
101 $0.00 $3.29 0.00 1.0 24.0 1
102 $0.00 $2.69 0.00 2.0 23.0 1
103 $0.79 $1.49 46.85 8.0 49.0 1
104 $1.00 $1.69 40.89 20.0 50.0 1
105 $5.09 $7.59 32.89 8.0 8.0 1
[106 rows x 23 columns]>
In [5]:
df.dtypes
Out[5]:
BIZI MART object
Vendor Detail Report\nDate: 08/28/2024 to 09/11/2024 object
Item # object
Item Name object
Cost object
Price object
Profit % object
Sold object
On Hand object
Order object
Vendor: object
TOBACCO {PITCO} object
Unnamed: 12 float64
Items Count: object
107 int64
047995855079 float64
American Spirit Black object
$0.00 object
$12.87 object
0.00 float64
1 float64
7 float64
1.1 int64
dtype: object
In [6]:
# Drop the first 15 columns and the 23rd column
df_cleaned = df.drop(df.columns[0:15], axis=1) # Drop columns 1 to 15
df_cleaned = df_cleaned.drop(df.columns[-1], axis=1) # Drop the 23rd column
In [7]:
# Drop any rows with blank entries (NAs)
df_cleaned = df_cleaned.dropna()
In [8]:
# Reconstruct the column names for columns 16 - 22
df_cleaned.columns = ['Item #', 'Item Name', 'Cost', 'Price', 'Profit %', 'Sold', 'On Hand']
In [9]:
df_cleaned.head()
Out[9]:
| Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | |
|---|---|---|---|---|---|---|---|
| 0 | 4.799586e+10 | American Spirit Blue | $10.93 | $12.87 | 15.09 | 24.0 | 18.0 |
| 1 | 4.799586e+10 | American Spirit Gold | $11.07 | $13.46 | 17.76 | 1.0 | 14.0 |
| 2 | 7.161020e+10 | BACKWOOD Smooth (1) | $0.00 | $1.69 | 0.00 | 2.0 | 16.0 |
| 3 | 7.161020e+10 | BACKWOOD Smooth (5 pk) | $0.00 | $7.99 | 0.00 | 2.0 | 65.0 |
| 4 | 7.161030e+10 | BACKWOOD SWEET AROMATIC (1) | $0.00 | $1.59 | 0.00 | 20.0 | 8.0 |
In [10]:
df_cleaned.info
Out[10]:
<bound method DataFrame.info of Item # Item Name Cost Price Profit % \
0 4.799586e+10 American Spirit Blue $10.93 $12.87 15.09
1 4.799586e+10 American Spirit Gold $11.07 $13.46 17.76
2 7.161020e+10 BACKWOOD Smooth (1) $0.00 $1.69 0.00
3 7.161020e+10 BACKWOOD Smooth (5 pk) $0.00 $7.99 0.00
4 7.161030e+10 BACKWOOD SWEET AROMATIC (1) $0.00 $1.59 0.00
.. ... ... ... ... ...
101 8.667410e+05 ZIG ZAG KING $0.00 $3.29 0.00
102 8.661490e+05 ZIG ZAG WHITE (32 pk) $0.00 $2.69 0.00
103 7.847621e+11 Zig-Zag Wraps Straight-Up (2pk) $0.79 $1.49 46.85
104 7.847621e+11 Zig-Zag Wraps Straight-Up (4 pk) $1.00 $1.69 40.89
105 6.092499e+11 ZYN Chill 6mg $5.09 $7.59 32.89
Sold On Hand
0 24.0 18.0
1 1.0 14.0
2 2.0 16.0
3 2.0 65.0
4 20.0 8.0
.. ... ...
101 1.0 24.0
102 2.0 23.0
103 8.0 49.0
104 20.0 50.0
105 8.0 8.0
[104 rows x 7 columns]>
In [11]:
df_cleaned.dtypes
Out[11]:
Item # float64 Item Name object Cost object Price object Profit % float64 Sold float64 On Hand float64 dtype: object
2. Converting "$" columns to float¶
In [12]:
# Convert 'Cost' and 'Price' to numeric values
df_cleaned['Cost'] = df_cleaned['Cost'].replace('[\$,]', '', regex=True).astype(float)
df_cleaned['Price'] = df_cleaned['Price'].replace('[\$,]', '', regex=True).astype(float)
In [13]:
df_cleaned.head()
Out[13]:
| Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | |
|---|---|---|---|---|---|---|---|
| 0 | 4.799586e+10 | American Spirit Blue | 10.93 | 12.87 | 15.09 | 24.0 | 18.0 |
| 1 | 4.799586e+10 | American Spirit Gold | 11.07 | 13.46 | 17.76 | 1.0 | 14.0 |
| 2 | 7.161020e+10 | BACKWOOD Smooth (1) | 0.00 | 1.69 | 0.00 | 2.0 | 16.0 |
| 3 | 7.161020e+10 | BACKWOOD Smooth (5 pk) | 0.00 | 7.99 | 0.00 | 2.0 | 65.0 |
| 4 | 7.161030e+10 | BACKWOOD SWEET AROMATIC (1) | 0.00 | 1.59 | 0.00 | 20.0 | 8.0 |
3. Create "Brand" column and organize products by brand¶
In [14]:
# Step 5: Dictionary for regex patterns for each brand
brand_patterns = {
'American Spirit': r'\bamerican\s*spirit\b',
'Backwoods': r'\bbackwood[s]*\b',
'Black & Mild': r'\bblack[\s&\-]*mild\b',
'BLK Smooth': r'\bblk\b',
'Bugler': r'\bbugler\b',
'Camel': r'\bcamel\b|\bkamel\b',
'Capri': r'\bcapri\b',
'Copenhagen': r'\bcopenhagen\b',
'Crowns': r'\bcrowns\b',
'Crownz': r'\bcrownz\b',
'Dutch': r'\bdutch\b',
'Fiesta': r'\bfiesta\b',
'FLM': r'\bflm\b',
'Fronto Leaf': r'\bfronto\b',
'Good Time': r'\bgood[\s\-]*times?\b', # Combines "Good Times" and "Good Time"
'Grabba': r'\bgrabba\b',
'HIGH HEMP': r'\bhigh[\s\-]*hemp\b',
'Job': r'\bjob\b',
'Kool': r'\bkool\b',
'Longhorn': r'\blonghorn\b',
'Looseleaf': r'\blooseleaf\b',
'Lucky Strike': r'\blucky\s*strike\b',
'Marlboro': r'\bmarlboro\b',
'Maverick': r'\bmaverick\b',
'Newport': r'\bnewport\b',
'Optimo': r'\boptimo\b',
'Pall Mall': r'\bpall\s*mall\b',
'Parliament': r'\bparliament\b',
'Pom Pom': r'\bpom(\s*pom)?\b', # Combines "Pom" and "Pom Pom"
'RAW': r'\braw\b',
'Red River': r'\bred\s*river\b',
'Rogue': r'\brogue\b',
'Seneca': r'\bseneca\b',
'Show': r'\bshow\b',
'Swisher': r'\bswisher\b',
'Winston': r'\bwinston\b',
'XXL': r'\bxxl\b',
'Zemis': r'\bzemis\b',
'Zig-Zag': r'\bzig[\s\-]*zag\b',
'ZYN': r'\bzyn\b'
}
In [15]:
# Step 6: Function to extract brand using regex
def extract_brand_regex(item_name, patterns):
for brand, pattern in patterns.items():
if re.search(pattern, item_name, re.IGNORECASE):
return brand
return 'Other' # Default if no brand matches
In [16]:
# Step 7: Apply the function to create the 'Brand' column with enhanced categorization
df_cleaned['Brand'] = df_cleaned['Item Name'].apply(lambda x: extract_brand_regex(x, brand_patterns))
In [17]:
# Display the first few rows to confirm the changes
df_cleaned.head()
Out[17]:
| Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | Brand | |
|---|---|---|---|---|---|---|---|---|
| 0 | 4.799586e+10 | American Spirit Blue | 10.93 | 12.87 | 15.09 | 24.0 | 18.0 | American Spirit |
| 1 | 4.799586e+10 | American Spirit Gold | 11.07 | 13.46 | 17.76 | 1.0 | 14.0 | American Spirit |
| 2 | 7.161020e+10 | BACKWOOD Smooth (1) | 0.00 | 1.69 | 0.00 | 2.0 | 16.0 | Backwoods |
| 3 | 7.161020e+10 | BACKWOOD Smooth (5 pk) | 0.00 | 7.99 | 0.00 | 2.0 | 65.0 | Backwoods |
| 4 | 7.161030e+10 | BACKWOOD SWEET AROMATIC (1) | 0.00 | 1.59 | 0.00 | 20.0 | 8.0 | Backwoods |
In [18]:
# Define regex patterns for categorizing "Wrap" products
wrap_patterns = [
r'\bwrap\b', # Matches 'wrap'
r'\bblunt\b', # Matches 'blunt'
r'\bwraps\b', # Matches 'wraps'
# r'\bleaf\b', # Matches 'leaf'
r'\bwrap\s*kit\b', # Matches 'wrap kit'
# Add more patterns as needed
]
# Function to categorize "Wrap" products using regex
def categorize_wrap(item_name, patterns):
for pattern in patterns:
if re.search(pattern, item_name, re.IGNORECASE):
return 'Wrap'
return None # Return None if no pattern matches
# Apply the function to create the "Type" column for Wrap products
df_cleaned['Type'] = df_cleaned['Item Name'].apply(lambda x: categorize_wrap(x, wrap_patterns))
# Display the first few rows to verify the results
df_cleaned.head()
Out[18]:
| Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | Brand | Type | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.799586e+10 | American Spirit Blue | 10.93 | 12.87 | 15.09 | 24.0 | 18.0 | American Spirit | None |
| 1 | 4.799586e+10 | American Spirit Gold | 11.07 | 13.46 | 17.76 | 1.0 | 14.0 | American Spirit | None |
| 2 | 7.161020e+10 | BACKWOOD Smooth (1) | 0.00 | 1.69 | 0.00 | 2.0 | 16.0 | Backwoods | None |
| 3 | 7.161020e+10 | BACKWOOD Smooth (5 pk) | 0.00 | 7.99 | 0.00 | 2.0 | 65.0 | Backwoods | None |
| 4 | 7.161030e+10 | BACKWOOD SWEET AROMATIC (1) | 0.00 | 1.59 | 0.00 | 20.0 | 8.0 | Backwoods | None |
In [19]:
# Dictionaries focusing on non-wrap items
cigarette_brands = {
'Marlboro': 'Cigarette',
'Camel': 'Cigarette',
'Newport': 'Cigarette',
'American Spirit': 'Cigarette',
'Pall Mall': 'Cigarette',
'Winston': 'Cigarette',
'Lucky Strike': 'Cigarette',
'Kool': 'Cigarette',
'Capri': 'Cigarette',
'Maverick': 'Cigarette',
'Seneca': 'Cigarette',
'Crowns': 'Cigarette',
'Parliament': 'Cigarette'
}
cigar_brands = {
'Backwoods': 'Cigar', # Note: Backwoods may also carry wraps, so focus on non-wrap items
'Swisher': 'Cigar', # Note: Swisher may also carry wraps, so focus on non-wrap items
'Dutch': 'Cigar', # Note: Dutch may also carry wraps, so focus on non-wrap items
'Black & Mild': 'Cigar',
'Optimo': 'Cigar',
'Pom Pom': 'Cigar',
'BLK Smooth': 'Cigar',
'Good Time': 'Cigar', # Note: Good Time may also carry wraps, so focus on non-wrap items
'Zemis': 'Cigar',
'Show': 'Cigar', # Note: Show may also carry wraps, so focus on non-wrap items
'Crownz': 'Cigar',
'Fiesta': 'Cigar'
}
chew_brands = {
'Copenhagen': 'Chew',
'ZYN': 'Chew',
'Longhorn': 'Chew',
'Rogue': 'Chew'
}
roll_paper_brands = {
'RAW': 'Roll Paper', # Note: RAW may also carry wraps, so focus on non-wrap items
'Zig-Zag': 'Roll Paper', # Note: Zig-Zag may also carry wraps, so focus on non-wrap items
'Job': 'Roll Paper',
'Red River': 'Roll Paper',
'HIGH HEMP': 'Wrap'
}
other_brands = {
'Bugler': 'Other',
'Fronto Leaf': 'Other',
'Looseleaf': 'Other',
'Grabba': 'Other', # Note: Grabba may also carry wraps, so focus on non-wrap items
'XXL': 'Other', # Note: XXL may also carry wraps, so focus on non-wrap items
'FLM': 'Other',
}
In [20]:
# Function to categorize by dictionary without overwriting existing "Wrap" types
def categorize_by_dict(brand, current_type):
if current_type == 'Wrap':
return current_type # Do not overwrite "Wrap" type
if brand in cigarette_brands:
return cigarette_brands[brand]
elif brand in cigar_brands:
return cigar_brands[brand]
elif brand in chew_brands:
return chew_brands[brand]
elif brand in roll_paper_brands:
return roll_paper_brands[brand]
elif brand in other_brands:
return other_brands[brand]
else:
return 'Other' # Default to 'Other' if no match is found
In [21]:
# Apply the function to fill in the "Type" column for non-Wrap products
df_cleaned['Type'] = df_cleaned.apply(lambda row: categorize_by_dict(row['Brand'], row['Type']), axis=1)
# Display the first few rows to verify the results
df_cleaned.head()
Out[21]:
| Item # | Item Name | Cost | Price | Profit % | Sold | On Hand | Brand | Type | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.799586e+10 | American Spirit Blue | 10.93 | 12.87 | 15.09 | 24.0 | 18.0 | American Spirit | Cigarette |
| 1 | 4.799586e+10 | American Spirit Gold | 11.07 | 13.46 | 17.76 | 1.0 | 14.0 | American Spirit | Cigarette |
| 2 | 7.161020e+10 | BACKWOOD Smooth (1) | 0.00 | 1.69 | 0.00 | 2.0 | 16.0 | Backwoods | Cigar |
| 3 | 7.161020e+10 | BACKWOOD Smooth (5 pk) | 0.00 | 7.99 | 0.00 | 2.0 | 65.0 | Backwoods | Cigar |
| 4 | 7.161030e+10 | BACKWOOD SWEET AROMATIC (1) | 0.00 | 1.59 | 0.00 | 20.0 | 8.0 | Backwoods | Cigar |
Creating Case quantity column.¶
In [22]:
# Updated function to determine case quantity including rules for 'Wrap' type
def determine_case_quantity(item_name, item_type, brand):
# Default case quantity
case_quantity = ''
# Rule for Cigarette type
if item_type == 'Cigarette':
return 10
# Apply rules for Cigar type
if item_type == 'Cigar':
# Regex pattern matching for "(1)", "2 pk", "3 pk", etc.
pattern = r'(\d+)\s*(?:pk|\))'
match = re.search(pattern, item_name, re.IGNORECASE)
if match:
num_units = int(match.group(1))
# Rules for specific brands
if brand in ['Backwoods', 'Zemis']:
return 8 # All cases for these brands return 8
elif brand == 'Black & Mild':
if num_units == 5:
return 10
return 25 # Default or num_units == 1
elif brand == 'BLK Smooth':
if num_units == 2:
return 15
return 25 # Default or num_units == 1
elif brand == 'Dutch':
return 15 # Covers all cases for Dutch
elif brand == 'Good Time':
if num_units == 3:
return 20
return 15 # Default or num_units == 2
elif brand in ['Swisher', 'Optimo']:
return 15 # Covers all cases for these brands
elif brand in ['Pom Pom', 'Show', 'Fiesta']:
return 15 # Covers all cases for these brands
# Default case assignment for specific brands when no regex match is found
if brand == 'Black & Mild':
return 25
elif brand in ['Swisher', 'Dutch', 'BLK Smooth', 'Optimo']:
return 15
# Apply rules only if the item type is 'Wrap'
if item_type == 'Wrap':
# Regex pattern matching for "(1)", "2 pk", "4 pk", etc.
pattern = r'(\d+)\s*(?:pk|\))'
match = re.search(pattern, item_name, re.IGNORECASE)
if match:
num_units = int(match.group(1))
# Rules for specific brands
if brand == 'Backwoods':
if num_units == 5:
return 8
elif brand == 'Good Time':
if num_units == 2:
return 30
return 30 # Default if no pattern matched
elif brand == 'Grabba':
if num_units == 1:
return 8
if num_units == 2:
return 25
elif brand == 'HIGH HEMP':
if num_units == 2:
return 15
return 15 # Default if no pattern matched
elif brand == 'Looseleaf':
if num_units == 5:
return 8
return 15 # Default if no pattern matched
elif brand == 'XXL':
if num_units == 2:
return 15
elif brand == 'Zig-Zag':
if num_units == 2:
return 15
elif num_units == 4:
return 15
# Default case assignment for specific brands when no regex match is found
if brand == 'HIGH HEMP':
return 15
# If no specific rule matches, return default blank
return case_quantity
In [23]:
# Apply the updated function to create a new 'Case' column
df_cleaned['Box'] = df_cleaned.apply(lambda row: determine_case_quantity(row['Item Name'], row['Type'], row['Brand']), axis=1)
Come back to fix Roll Paper and Other type cases¶
Calculate case sold and on hand¶
In [24]:
# Convert the 'Sold', 'On Hand', and 'Box' columns to numeric, coercing errors to NaN
df_cleaned['Sold'] = pd.to_numeric(df_cleaned['Sold'], errors='coerce')
df_cleaned['On Hand'] = pd.to_numeric(df_cleaned['On Hand'], errors='coerce')
df_cleaned['Box'] = pd.to_numeric(df_cleaned['Box'], errors='coerce')
# Recalculate the "Sold by Box" and "On Hand by Box" with the corrected data types
df_cleaned['Sold by Box'] = df_cleaned.apply(
lambda row: f"{int(row['Sold'] // row['Box'])} boxes" +
(f" and {int(row['Sold'] % row['Box'])} pieces" if row['Sold'] % row['Box'] > 0 else "")
if pd.notna(row['Box']) and row['Box'] > 0 else "", axis=1)
df_cleaned['On Hand by Box'] = df_cleaned.apply(
lambda row: f"{int(row['On Hand'] // row['Box'])} boxes" +
(f" and {int(row['On Hand'] % row['Box'])} pieces" if row['On Hand'] % row['Box'] > 0 else "")
if pd.notna(row['Box']) and row['Box'] > 0 else "", axis=1)
Reorganize column order¶
In [25]:
# Reorganize the columns in the specified order
df_cleaned = df_cleaned[['Item #', 'Item Name', 'Brand', 'Type', 'Box', 'Sold by Box', 'On Hand by Box', 'Sold', 'On Hand']]
In [26]:
# Reorganize the dataframe by sorting it based on the 'Type' column in alphabetical order
sorted_data = df_cleaned.sort_values(by=['Type', 'Item Name'], ascending=[True, True]).reset_index(drop=True)
Save the Dataframe as a CSV file¶
In [27]:
# # Save the final DataFrame to a new CSV file
# final_output_path = '/Users/kvellian/My Drive/DePaul University/Bizi Projects/output_by_case.csv'
# sorted_data.to_csv(final_output_path, index=False)
Saving as an Excel file and Color Coding Reorder Point Conditions¶
In [28]:
def save_sorted_data_with_formatting(df, output_path):
"""
Saves the provided DataFrame to an Excel file with conditional formatting,
incorporating "Box" values into reorder point calculations.
Args:
df (pd.DataFrame): The DataFrame to save and format.
output_path (str): The file path to save the Excel file.
"""
# Convert the final DataFrame to an Excel file
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
df.to_excel(writer, index=False, sheet_name='Sheet1')
worksheet = writer.sheets['Sheet1']
# Dictionary that maps color names to their hex codes
color_map = {
"red": "FF0000",
"yellow": "FFFF00",
"green": "00FF00",
"orange": "FFA500"
}
# Use the color_map to assign hex values to the PatternFill
red_fill = PatternFill(start_color=color_map["red"], end_color=color_map["red"], fill_type="solid")
yellow_fill = PatternFill(start_color=color_map["yellow"], end_color=color_map["yellow"], fill_type="solid")
green_fill = PatternFill(start_color=color_map["green"], end_color=color_map["green"], fill_type="solid")
orange_fill = PatternFill(start_color=color_map["orange"], end_color=color_map["orange"], fill_type="solid")
# Determine the number of columns in the DataFrame
max_column = len(df.columns)
# Apply conditional formatting row by row
for idx, row in df.iterrows():
sold = row['Sold']
on_hand = row['On Hand']
box = row['Box']
excel_row = idx + 2 # +2 because Excel rows start at 1 and there is a header row
# Red Condition
if sold > on_hand or on_hand < box:
# Change condition if needed
for cell in worksheet[f'A{excel_row}':f'{chr(64+max_column)}{excel_row}']:
for c in cell:
c.fill = red_fill
# Yellow Condition
elif on_hand <= 1.5 * sold or (box and on_hand <= 1.5 * box and on_hand > box):
# Change condition if needed
for cell in worksheet[f'A{excel_row}':f'{chr(64+max_column)}{excel_row}']:
for c in cell:
c.fill = yellow_fill
# Orange Condition (New)
elif (on_hand <= 2 * sold and on_hand > 1.5 * sold) or (box and on_hand <= 2 * box and on_hand > 1.5 * box):
# Change condition if needed
for cell in worksheet[f'A{excel_row}':f'{chr(64+max_column)}{excel_row}']:
for c in cell:
c.fill = orange_fill
# Green Condition
elif on_hand > 2 * sold and (box is None or on_hand > 2 * box):
# Change condition if needed
for cell in worksheet[f'A{excel_row}':f'{chr(64+max_column)}{excel_row}']:
for c in cell:
c.fill = green_fill
In [29]:
# Save Excel file
excel_color_coded_path = '/Users/kvellian/My Drive/Bizi Projects/Bizi_Tobacco_List_9.11.24.xlsx'
save_sorted_data_with_formatting(sorted_data, excel_color_coded_path)
In [ ]:
Save the CSV file and PDF file¶
In [ ]:
In [30]:
# def save_dataframe_to_pdf_adjusted_with_bold_headers(df, pdf_file_path):
# # Define the maximum number of rows per page
# max_rows_per_page = 20
# # Create a PDF file to save the table
# with PdfPages(pdf_file_path) as pdf:
# for i in range(0, len(df), max_rows_per_page):
# fig, ax = plt.subplots(figsize=(12, 14)) # Adjust size to fit the page
# ax.axis('tight')
# ax.axis('off')
# # Extract the subset of the DataFrame for the current page
# subset_df = df.iloc[i:i + max_rows_per_page]
# # Create the table with text wrapping and bold headers
# table = ax.table(cellText=subset_df.values,
# colLabels=subset_df.columns,
# cellLoc='center',
# loc='center',
# colWidths=[0.15, 0.35, 0.2, 0.12, 0.1, 0.15, 0.15]) # Adjust the column widths
# # Set properties for better text visibility
# table.auto_set_font_size(False)
# table.set_fontsize(7) # Slightly reduce the font size
# table.scale(1.2, 1.8) # Increase scaling for better spacing
# # Bold the header row
# for key, cell in table.get_celld().items():
# cell.set_text_props(wrap=True)
# cell.set_height(0.05) # Adjust row height to prevent overlapping
# if key[0] == 0: # First row is the header
# cell.set_fontsize(8)
# cell.set_text_props(weight='bold')
# # Save the current page to the PDF
# pdf.savefig(fig, bbox_inches='tight')
# plt.close(fig)
In [31]:
# # Save the DataFrame to PDF with bold headers
# pdf_file_path_bold_headers = '/Users/kvellian/My Drive/Bizi Projects/output_by_case.pdf'
# save_dataframe_to_pdf_adjusted_with_bold_headers(sorted_data, pdf_file_path_bold_headers)
In [ ]: