## definitions
import pandas as pd
import numpy as np
import seaborn as sns

## human-readable numbers: render every float with three decimal places
pd.set_option('display.float_format', '{:.3f}'.format)

## plotting theme shared by all figures below
sns.set(
    context="notebook",
    style='darkgrid',
    palette="Spectral",
    font_scale=1,
    color_codes=True,
)


## load the complete data set
WholeData = pd.read_csv('data/cars.csv')

## list every column together with its non-null count and dtype
WholeData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1513200 entries, 0 to 1513199
Data columns (total 17 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   brand                 1513200 non-null  object 
 1   name                  1513200 non-null  object 
 2   bodyType              1513200 non-null  object 
 3   color                 1403466 non-null  object 
 4   fuelType              1509640 non-null  object 
 5   year                  1102226 non-null  float64
 6   mileage               1498720 non-null  float64
 7   transmission          1510135 non-null  object 
 8   power                 1492313 non-null  float64
 9   price                 1513200 non-null  int64  
 10  vehicleConfiguration  1102226 non-null  object 
 11  engineName            1101142 non-null  object 
 12  engineDisplacement    1092435 non-null  object 
 13  date                  1513200 non-null  object 
 14  location              1513200 non-null  object 
 15  link                  1513200 non-null  object 
 16  parse_date            1513200 non-null  object 
dtypes: float64(3), int64(1), object(13)
memory usage: 196.3+ MB


## Since data source is from 2022, the age of the car is calculated as 2022 - year
REFERENCE_YEAR = 2022


def _parse_whole_number(raw):
    """Turn one raw CSV cell into an int, or NaN when the cell holds no value.

    Intended for use as a ``pd.read_csv`` converter. Blank cells (including
    whitespace-only ones) and cells that parse to NaN are reported as
    ``np.nan`` so a later ``dropna()`` can discard them. Any fractional part
    is truncated. Text that is not a number still raises ``ValueError`` so
    corrupt data is not silently hidden.
    """
    text = raw.strip() if isinstance(raw, str) else raw
    if text == '':
        return np.nan
    number = float(text)
    if number != number:  # NaN is the only value that is unequal to itself
        return np.nan
    return int(number)


def _parse_age(raw):
    """Turn a raw manufacturing-year cell into the car's age in years.

    Returns NaN when the year is missing (NaN propagates through the
    subtraction).
    """
    return REFERENCE_YEAR - _parse_whole_number(raw)


## converters keyed by purpose; each takes the raw cell value of one column
customConverter = {'getAge': _parse_age, 'toNumber': _parse_whole_number}

## reduction to columns of interest; 'year' is converted to the car's age
## and 'price' to a whole number while the file is being parsed
YearPriceDF = pd.read_csv(
    'data/cars.csv',
    usecols=['year', 'mileage', 'price', 'brand', 'name'],
    converters={
        'year': customConverter['getAge'],
        'price': customConverter['toNumber'],
    },
)

## drop rows with any missing value and give the converted column its real name
YearPriceDF = YearPriceDF.dropna().rename(columns={'year': 'age'})

## reduction to cars younger than 30 years
YearPriceDF = YearPriceDF.loc[YearPriceDF['age'] < 30]


## pick a specific car: narrow down to the brand first, then to the model
YearPriceDF_BMW = YearPriceDF.loc[lambda frame: frame['brand'] == 'BMW']
YearPriceDF_BMW_X5 = YearPriceDF_BMW.loc[lambda frame: frame['name'] == 'X5']

## column overview (non-null counts and dtypes) of the selection
YearPriceDF_BMW_X5.info()

## basic statistics of the numeric columns
YearPriceDF_BMW_X5.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 3269 entries, 268 to 1512843
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   brand    3269 non-null   object 
 1   name     3269 non-null   object 
 2   age      3269 non-null   float64
 3   mileage  3269 non-null   float64
 4   price    3269 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 153.2+ KB


## price plotted against age and against mileage, one regression panel each
p = sns.pairplot(
    YearPriceDF_BMW_X5,
    kind='reg',
    x_vars=['age', 'mileage'],
    y_vars='price',
    height=8,
    aspect=1,
)


from scipy import stats

## flatten the three columns of interest into 1-D arrays
price = YearPriceDF_BMW_X5["price"].to_numpy()
age = YearPriceDF_BMW_X5["age"].to_numpy()
mileage = YearPriceDF_BMW_X5["mileage"].to_numpy()

## regress price on each predictor in turn and print the same summary for both:
## coefficient of determination, p-value and slope (raw and multiplied by 0.24
## for the CZK figure)
for heading, predictor in (("Age / Price", age), ("Mileage / Price", mileage)):
    slope, intercept, r_value, p_value, std_err = stats.linregress(predictor, price)

    print()
    print(heading)
    print(f"koeficient determinace: {r_value * r_value:.3f}")
    print(f"p-hodnota: {p_value:.5f}")
    print(f"sklon: {slope:.3f} -> v CZK: {slope * 0.24:.3f}")

Age / Price
koeficient determinace: 0.619
p-hodnota: 0.00000
sklon: -516057.576 -> v CZK: -123853.818

Mileage / Price
koeficient determinace: 0.599
p-hodnota: 0.00000
sklon: -35.749 -> v CZK: -8.580


## pick a second car for comparison: brand first, then model
YearPriceDF_MB = YearPriceDF.loc[lambda frame: frame['brand'] == 'Mercedes-Benz']
YearPriceDF_MB_Sclass = YearPriceDF_MB.loc[lambda frame: frame['name'] == 'S-Class']

## column overview (non-null counts and dtypes) of the selection
YearPriceDF_MB_Sclass.info()

## basic statistics of the numeric columns
YearPriceDF_MB_Sclass.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 4033 entries, 323 to 1512784
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   brand    4033 non-null   object 
 1   name     4033 non-null   object 
 2   age      4033 non-null   float64
 3   mileage  4033 non-null   float64
 4   price    4033 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 189.0+ KB


## price plotted against age and against mileage, one regression panel each
p = sns.pairplot(
    YearPriceDF_MB_Sclass,
    kind='reg',
    x_vars=['age', 'mileage'],
    y_vars='price',
    height=8,
    aspect=1,
)


from scipy import stats

## flatten the three columns of interest into 1-D arrays
price = YearPriceDF_MB_Sclass["price"].to_numpy()
age = YearPriceDF_MB_Sclass["age"].to_numpy()
mileage = YearPriceDF_MB_Sclass["mileage"].to_numpy()

## regress price on each predictor in turn and print the same summary for both:
## coefficient of determination, p-value and slope (raw and multiplied by 0.24
## for the CZK figure)
for heading, predictor in (("Age / Price", age), ("Mileage / Price", mileage)):
    slope, intercept, r_value, p_value, std_err = stats.linregress(predictor, price)

    print()
    print(heading)
    print(f"koeficient determinace: {r_value * r_value:.3f}")
    print(f"p-hodnota: {p_value:.5f}")
    print(f"sklon: {slope:.3f} -> v CZK: {slope * 0.24:.3f}")

Age / Price
koeficient determinace: 0.371
p-hodnota: 0.00000
sklon: -309744.856 -> v CZK: -74338.765

Mileage / Price
koeficient determinace: 0.402
p-hodnota: 0.00000
sklon: -32.262 -> v CZK: -7.743

	age	mileage	price
count	3269.000	3269.000	3269.000
mean	8.899	110590.089	4985671.145
std	6.186	87815.786	4057037.092
min	2.000	1000.000	250000.000
25%	4.000	37000.000	1400000.000
50%	6.000	90000.000	3950000.000
75%	16.000	160000.000	7150000.000
max	23.000	352000.000	16900000.000

	age	mileage	price
count	4033.000	4033.000	4033.000
mean	9.720	105483.015	3685761.831
std	6.298	62897.843	3200848.072
min	1.000	1000.000	400000.000
25%	4.000	70000.000	1490000.000
50%	7.000	100000.000	3750000.000
75%	16.000	128000.000	3990000.000
max	27.000	460000.000	21000000.000

Statistická práce: Prodej automobilů

Data

Cíl práce

Lineární regrese - ověření nepřímé úměry mezi cenou a stářím

Vizualizace dat

Závěr