rows, cols = df.shape
mem_bytes = int(df.memory_usage(deep=True).sum())
mem_mb = round(mem_bytes / (1024*1024), 2)
| Column | Flags | Type | Unique Values | Unique Ratio | Missing (%) |
|---|---|---|---|---|---|
| vin | id-like | object | 22337 | 0.999284 | 0.0 |
# Build Integrity Notes lists and full flagged table
flag_rows = []
for c in df.columns:
f = []
if c in id_like: f.append('id-like')
if c in hi_card: f.append('high-card')
if c in consts: f.append('constant')
if f:
flag_rows.append({
'Column': c,
'Flags': ', '.join(f),
'Type': dtypes[c],
'Unique Values': int(uniques.get(c,0)),
'Unique Ratio': float(uniq_ratio.get(c,0)),
'Missing (%)': float(miss_pct.get(c,0))
})
flagged_df = pd.DataFrame(flag_rows)
flagged_df
| year | make | model | trim | body | transmission | vin | state | condition | odometer | color | interior | seller | mmr | sellingprice | saledate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2010 | Chrysler | PT Cruiser | Classic | Wagon | automatic | 3a4gy5f97at172213 | tn | 26 | 67969 | gray | gray | tdaf remarketing | 5225 | 5500 | Wed Jun 03 2015 03:30:00 GMT-0700 (PDT) |
| 2008 | Nissan | Altima | 3.5 SE | Sedan | automatic | 1n4bl21ex8c168024 | pa | 2 | 126302 | gray | black | valley national bank | 5625 | 3800 | Tue May 26 2015 02:30:00 GMT-0700 (PDT) |
| 2012 | Jeep | Liberty | Sport | SUV | automatic | 1c4pjlak6cw100859 | tx | 2 | 68499 | green | black | north america financial | 10300 | 7900 | Tue Jan 20 2015 02:30:00 GMT-0800 (PST) |
| 2009 | Jeep | Wrangler | Unlimited X | suv | automatic | 1j4ga391x9l706276 | fl | 36 | 63592 | yellow | gray | tdaf remarketing | 17800 | 19400 | Mon Jun 08 2015 06:00:00 GMT-0700 (PDT) |
| 2013 | Nissan | Juke | S | Hatchback | automatic | jn8af5mrxdt213006 | tx | 39 | 33185 | white | black | nissan motor acceptance corporation | 13800 | 13000 | Wed May 27 2015 03:30:00 GMT-0700 (PDT) |
| 2013 | BMW | 5 Series | 528i | Sedan | automatic | wbaxg5c51ddy30612 | tn | 42 | 18011 | white | gray | harold mathews nissan inc | 43200 | 32000 | Wed Jun 24 2015 03:00:00 GMT-0700 (PDT) |
| 2008 | Audi | A4 | 2.0T | Sedan | automatic | wauaf78e48a080112 | nc | 37 | 103193 | blue | black | hendrick toyota scion concord | 8100 | 8100 | Wed Feb 25 2015 06:00:00 GMT-0800 (PST) |
| 2009 | Mercedes-Benz | C-Class | C300 Luxury | sedan | automatic | wddgf54x29r044131 | tx | 41 | 99506 | gray | beige | mercedes-benz of houston north | 12300 | 10200 | Thu May 28 2015 03:30:00 GMT-0700 (PDT) |
df[['year', 'make', 'model', 'trim', 'body', 'transmission', 'vin', 'state', 'condition', 'odometer', 'color', 'interior', 'seller', 'mmr', 'sellingprice', 'saledate']].head(8)
| year | condition | odometer | mmr | sellingprice |
|---|---|---|---|---|
| 22353.000000 | 21879.000000 | 22350.000000 | 22352.000000 | 22353.000000 |
| 2010.007516 | 30.649618 | 68717.950067 | 13619.834914 | 13469.380173 |
| 3.979315 | 13.345850 | 53200.460518 | 9596.389362 | 9648.996468 |
| 1991.000000 | 1.000000 | 1.000000 | 25.000000 | 200.000000 |
| 2007.000000 | 23.000000 | 28430.500000 | 7000.000000 | 6800.000000 |
| 2012.000000 | 34.000000 | 52922.500000 | 12150.000000 | 12000.000000 |
| 2013.000000 | 41.000000 | 99079.250000 | 18100.000000 | 18000.000000 |
| 2015.000000 | 49.000000 | 999999.000000 | 170000.000000 | 165000.000000 |
df[['year', 'make', 'model', 'trim', 'body', 'transmission', 'vin', 'state', 'condition', 'odometer', 'color', 'interior', 'seller', 'mmr', 'sellingprice', 'saledate']].describe()
| Column | Type | Non-Null Count | Missing (%) | Unique Values | Sample Values | Flags |
|---|---|---|---|---|---|---|
| year | int64 | 22353 | 0.0 | 25 | 2010, 2008, 2012 | |
| make | object | 21953 | 1.8 | 73 | Chrysler, Nissan, Jeep | |
| model | object | 21950 | 1.8 | 644 | PT Cruiser, Altima, Liberty | |
| trim | object | 21938 | 1.9 | 941 | Classic, 3.5 SE, Sport | |
| body | object | 21827 | 2.4 | 62 | Wagon, Sedan, SUV | |
| transmission | object | 19787 | 11.5 | 3 | automatic, manual, sedan | |
| vin | object | 22353 | 0.0 | 22337 | 3a4gy5f97at172213, 1n4bl21ex8c168024, 1c4pjlak6cw100859 | id-like |
| state | object | 22353 | 0.0 | 38 | tn, pa, tx | |
| condition | float64 | 21879 | 2.1 | 41 | 26.0, 2.0, 36.0 | |
| odometer | float64 | 22350 | 0.0 | 20639 | 67969.0, 126302.0, 68499.0 | |
| color | object | 22324 | 0.1 | 19 | gray, green, yellow | |
| interior | object | 22324 | 0.1 | 17 | gray, black, beige | |
| seller | object | 22353 | 0.0 | 3578 | tdaf remarketing, valley national bank, north america financial | |
| mmr | float64 | 22352 | 0.0 | 962 | 5225.0, 5625.0, 10300.0 | |
| sellingprice | float64 | 22353 | 0.0 | 781 | 5500.0, 3800.0, 7900.0 | |
| saledate | object | 22353 | 0.0 | 1828 | Wed Jun 03 2015 03:30:00 GMT-0700 (PDT), Tue May 26 2015 02:30:00 GMT-0700 (PDT), Tue Jan 20 2015 02:30:00 GMT-0800 (PST) |
dtypes = df.dtypes.astype(str)
nonnull = df.notnull().sum()
miss_pct = (df.isnull().mean()*100).round(1)
uniques = df.nunique(dropna=True)
n = max(len(df), 1)
uniq_ratio = (uniques / n).fillna(0.0)
def _sample_vals(s, k=3):
vals = pd.unique(s.dropna().astype(str))[:k]
return ', '.join(map(str, vals)) if len(vals) else ''
flags_col = []
for c in df.columns:
flags=[]
if uniques.get(c,0) <= 1: flags.append('constant')
if uniq_ratio.get(c,0) >= 0.95 and 'datetime' not in dtypes[c].lower(): flags.append('id-like')
if dtypes[c].startswith('object') and uniq_ratio.get(c,0) > 0.5 and 'id-like' not in flags: flags.append('high-card')
flags_col.append(', '.join(flags))
profile_df = pd.DataFrame({
'Column': df.columns,
'Type': dtypes.values,
'Non-Null Count': nonnull.values,
'Missing (%)': miss_pct.values,
'Unique Values': uniques.values,
'Sample Values': [ _sample_vals(df[c]) for c in df.columns ],
'Flags': flags_col
})
profile_df
| Column | Type | Non-Null Count | Unique Values |
|---|---|---|---|
| year | int64 | 22353 | 25 |
| make | object | 21953 | 73 |
| model | object | 21950 | 644 |
| trim | object | 21938 | 941 |
| body | object | 21827 | 62 |
| transmission | object | 19787 | 3 |
| vin | object | 22353 | 22337 |
| state | object | 22353 | 38 |
| condition | float64 | 21879 | 41 |
| odometer | float64 | 22350 | 20639 |
| color | object | 22324 | 19 |
| interior | object | 22324 | 17 |
| seller | object | 22353 | 3578 |
| mmr | float64 | 22352 | 962 |
| sellingprice | float64 | 22353 | 781 |
| saledate | object | 22353 | 1828 |
pd.DataFrame({
'Column': df.columns,
'Type': df.dtypes.astype(str).values,
'Non-Null Count': df.notnull().sum().values,
'Unique Values': df.nunique().values
})
| year | make | model | trim | body | transmission | vin | state | condition | odometer | color | interior | seller | mmr | sellingprice | saledate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2012 | Ferrari | California | Base | Convertible | automatic | zff65lja6c0185410 | fl | 45.0 | 10180.0 | red | tan | the collection | 148000.0 | 141000.0 | Thu Feb 19 2015 01:15:00 GMT-0800 (PST) |
| year | make | model | trim | body | transmission | vin | state | condition | odometer | color | interior | seller | mmr | sellingprice | saledate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2014 | BMW | M6 | Base | Convertible | automatic | wbslz9c55ed651003 | nj | 49.0 | 3509.0 | black | gray | bmw na manheim new jersey | 102000.0 | 103000.0 | Tue Feb 10 2015 06:00:00 GMT-0800 (PST) |
| year | make | model | trim | body | transmission | vin | state | condition | odometer | color | interior | seller | mmr | sellingprice | saledate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2014 | Porsche | Cayenne | Turbo S | SUV | automatic | wp1ac2a28ela85651 | fl | 25.0 | 16068.0 | black | black | mercedes benz of miami | 96000.0 | 74500.0 | Tue Mar 10 2015 02:15:00 GMT-0700 (PDT) |
num_cols = df.select_dtypes(include='number').columns.tolist()[:max_cols]
df_num = df[num_cols]
df_num = df_num.sample(max_rows, random_state=0) if len(df_num) > max_rows else df_num
med = df_num.median(); mad = (df_num - med).abs().median()
rz = 0.6745 * (df_num - med) / mad.replace(0, np.nan)
row_score = rz.abs().max(axis=1)
top3 = row_score.nlargest(3)
dfn = df.select_dtypes(include='number').iloc[:, :max_cols]
dfn = dfn.sample(max_rows, random_state=0) if len(dfn) > max_rows else dfn
# rank columns by Tukey outliers (1.5*IQR) and plot violins with inner box
| Column | Missing Values | Missing (%) |
|---|---|---|
| make | 400 | 1.8 |
| model | 403 | 1.8 |
| trim | 415 | 1.9 |
| body | 526 | 2.4 |
| transmission | 2566 | 11.5 |
| condition | 474 | 2.1 |
| odometer | 3 | 0.0 |
| color | 29 | 0.1 |
| interior | 29 | 0.1 |
| mmr | 1 | 0.0 |
nulls = df.isnull().sum()
nulls_pct = (df.isnull().mean() * 100).round(1)
missing_df = pd.DataFrame({
'Column': df.columns,
'Missing Values': nulls.values,
'Missing (%)': nulls_pct.values
})\nmissing_df[missing_df['Missing Values'] > 0]
nulls = df.isnull().sum();
nulls_pct = (
df.isnull().mean()*100
).round(1)
missing_df = pd.DataFrame({
'Column': df.columns,
'Missing Values': nulls.values,
'Missing (%)': nulls_pct.values
})
top_miss = (
missing_df[missing_df['Missing Values'] > 0]
.sort_values('Missing (%)', ascending=False)
.loc[:, ['Column', 'Missing (%)']]
.head(20)
.reset_index(drop=True)
)
top_miss
dist_col = '<chosen categorical>'
s = df[dist_col].astype('object').where(~df[dist_col].isna(), other='Missing')
vc = s.value_counts(dropna=False)
top_k = 8 # Top-8 + Other (+ Missing)