Construyendo un Data Frame desde diccionarios y cargando datos en un data frame
Contents
Construyendo un Data Frame desde diccionarios y cargando datos en un data frame¶
import numpy as np
import pandas as pd
import os
# Two separately allocated grids, each with 10 evenly spaced samples on [0, 10].
x, y = (np.linspace(0, 10, num=10) for _ in range(2))
# Each dictionary key becomes a column label; each array becomes that column's values.
d = dict(x=x, y=y)
df = pd.DataFrame(data=d)
# Select a single column (a Series) by its label.
df["y"]
0 0.000000
1 1.111111
2 2.222222
3 3.333333
4 4.444444
5 5.555556
6 6.666667
7 7.777778
8 8.888889
9 10.000000
Name: y, dtype: float64
# Dataset source: https://www.kaggle.com/gpreda/covid-world-vaccination-progress?select=country_vaccinations
# NOTE: reading .xlsx files needs the optional "openpyxl" package (install it with pip or conda).
path = "https://github.com/hernansalinas/Curso_aprendizaje_estadistico/blob/main/datasets/sesion_01b_country_vaccinations.xlsx?raw=true"
df = pd.read_excel(path)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
File ~\anaconda3\envs\book\lib\site-packages\pandas\compat\_optional.py:126, in import_optional_dependency(name, extra, errors, min_version)
125 try:
--> 126 module = importlib.import_module(name)
127 except ImportError:
File ~\anaconda3\envs\book\lib\importlib\__init__.py:126, in import_module(name, package)
125 level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)
File <frozen importlib._bootstrap>:1050, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1027, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1004, in _find_and_load_unlocked(name, import_)
ModuleNotFoundError: No module named 'openpyxl'
During handling of the above exception, another exception occurred:
ImportError Traceback (most recent call last)
Input In [5], in <cell line: 4>()
2 # https://www.kaggle.com/gpreda/covid-world-vaccination-progress?select=country_vaccinations
3 path="https://github.com/hernansalinas/Curso_aprendizaje_estadistico/blob/main/datasets/sesion_01b_country_vaccinations.xlsx?raw=true"
----> 4 df = pd.read_excel(f"{path}")
File ~\anaconda3\envs\book\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File ~\anaconda3\envs\book\lib\site-packages\pandas\io\excel\_base.py:457, in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, decimal, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
455 if not isinstance(io, ExcelFile):
456 should_close = True
--> 457 io = ExcelFile(io, storage_options=storage_options, engine=engine)
458 elif engine and engine != io.engine:
459 raise ValueError(
460 "Engine should not be specified when passing "
461 "an ExcelFile - ExcelFile already has the engine set"
462 )
File ~\anaconda3\envs\book\lib\site-packages\pandas\io\excel\_base.py:1419, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options)
1416 self.engine = engine
1417 self.storage_options = storage_options
-> 1419 self._reader = self._engines[engine](self._io, storage_options=storage_options)
File ~\anaconda3\envs\book\lib\site-packages\pandas\io\excel\_openpyxl.py:524, in OpenpyxlReader.__init__(self, filepath_or_buffer, storage_options)
509 def __init__(
510 self,
511 filepath_or_buffer: FilePath | ReadBuffer[bytes],
512 storage_options: StorageOptions = None,
513 ) -> None:
514 """
515 Reader using openpyxl engine.
516
(...)
522 passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``)
523 """
--> 524 import_optional_dependency("openpyxl")
525 super().__init__(filepath_or_buffer, storage_options=storage_options)
File ~\anaconda3\envs\book\lib\site-packages\pandas\compat\_optional.py:129, in import_optional_dependency(name, extra, errors, min_version)
127 except ImportError:
128 if errors == "raise":
--> 129 raise ImportError(msg)
130 else:
131 return None
ImportError: Missing optional dependency 'openpyxl'. Use pip or conda to install openpyxl.
df.head()
country | iso_code | date | total_vaccinations | people_vaccinated | people_fully_vaccinated | daily_vaccinations_raw | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | daily_vaccinations_per_million | vaccines | source_name | source_website | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | AFG | 2021-02-22 | 0.0 | 0.0 | NaN | NaN | NaN | 0.0 | 0.0 | NaN | NaN | Oxford/AstraZeneca | Government of Afghanistan | http://www.xinhuanet.com/english/asiapacific/2... |
1 | Afghanistan | AFG | 2021-02-23 | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | 35.0 | Oxford/AstraZeneca | Government of Afghanistan | http://www.xinhuanet.com/english/asiapacific/2... |
2 | Afghanistan | AFG | 2021-02-24 | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | 35.0 | Oxford/AstraZeneca | Government of Afghanistan | http://www.xinhuanet.com/english/asiapacific/2... |
3 | Afghanistan | AFG | 2021-02-25 | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | 35.0 | Oxford/AstraZeneca | Government of Afghanistan | http://www.xinhuanet.com/english/asiapacific/2... |
4 | Afghanistan | AFG | 2021-02-26 | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | 35.0 | Oxford/AstraZeneca | Government of Afghanistan | http://www.xinhuanet.com/english/asiapacific/2... |
# https://www.kaggle.com/sakhawat18/asteroid-dataset
# https://ssd.jpl.nasa.gov/tools/sbdb_query.html
# NOTE(review): the links above point to an asteroid dataset and a JPL query tool, but the
# CSV loaded below displays star properties (temperature, luminosity, ...) - confirm the source.
path_git = "https://raw.githubusercontent.com/hernansalinas/Curso_aprendizaje_estadistico/main/datasets/sesion_01b_dataset.csv"
# The URL is already a string, so it is passed to read_csv directly.
df = pd.read_csv(path_git)
df
Temperature (K) | Luminosity(L/Lo) | Radius(R/Ro) | Absolute magnitude(Mv) | Star type | Star color | Spectral Class | |
---|---|---|---|---|---|---|---|
0 | 3068 | 0.002400 | 0.1700 | 16.12 | 0 | Red | M |
1 | 3042 | 0.000500 | 0.1542 | 16.60 | 0 | Red | M |
2 | 2600 | 0.000300 | 0.1020 | 18.70 | 0 | Red | M |
3 | 2800 | 0.000200 | 0.1600 | 16.65 | 0 | Red | M |
4 | 1939 | 0.000138 | 0.1030 | 20.06 | 0 | Red | M |
... | ... | ... | ... | ... | ... | ... | ... |
235 | 38940 | 374830.000000 | 1356.0000 | -9.93 | 5 | Blue | O |
236 | 30839 | 834042.000000 | 1194.0000 | -10.63 | 5 | Blue | O |
237 | 8829 | 537493.000000 | 1423.0000 | -10.73 | 5 | White | A |
238 | 9235 | 404940.000000 | 1112.0000 | -11.23 | 5 | White | A |
239 | 37882 | 294903.000000 | 1783.0000 | -7.80 | 5 | Blue | O |
240 rows × 7 columns
Lectura de un dataset ubicado en Google Drive
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSHCOR8_Ha6TvBQwIcpjvJ0bzHYel1S8DXl4NHnMhVvdbibrgL_SP6rffuESpaJvPwLuUizXblQtHox/pub?output=csv"
df = pd.read_csv(url)
df
location | date | vaccine | total_vaccinations | |
---|---|---|---|---|
0 | Austria | 2021-01-08 | Johnson&Johnson | 0 |
1 | Austria | 2021-01-08 | Moderna | 0 |
2 | Austria | 2021-01-08 | Oxford/AstraZeneca | 0 |
3 | Austria | 2021-01-08 | Pfizer/BioNTech | 31284 |
4 | Austria | 2021-01-15 | Johnson&Johnson | 0 |
... | ... | ... | ... | ... |
21892 | European Union | 2021-11-20 | Oxford/AstraZeneca | 67212673 |
21893 | European Union | 2021-11-20 | Pfizer/BioNTech | 438725101 |
21894 | European Union | 2021-11-20 | Sinopharm/Beijing | 2157500 |
21895 | European Union | 2021-11-20 | Sinovac | 9 |
21896 | European Union | 2021-11-20 | Sputnik V | 1845062 |
21897 rows × 4 columns
Asignación de la columna date como índice (index)
df = pd.read_csv(url, index_col="date")
df
location | vaccine | total_vaccinations | |
---|---|---|---|
date | |||
2021-01-08 | Austria | Johnson&Johnson | 0 |
2021-01-08 | Austria | Moderna | 0 |
2021-01-08 | Austria | Oxford/AstraZeneca | 0 |
2021-01-08 | Austria | Pfizer/BioNTech | 31284 |
2021-01-15 | Austria | Johnson&Johnson | 0 |
... | ... | ... | ... |
2021-11-20 | European Union | Oxford/AstraZeneca | 67212673 |
2021-11-20 | European Union | Pfizer/BioNTech | 438725101 |
2021-11-20 | European Union | Sinopharm/Beijing | 2157500 |
2021-11-20 | European Union | Sinovac | 9 |
2021-11-20 | European Union | Sputnik V | 1845062 |
21897 rows × 3 columns
Renombrar columnas
# Old label -> new label. The spaces padding " Vaccine " are deliberate:
# note the extra whitespace in that column name.
column_titles = {
    "location": "Location",
    "vaccine": " Vaccine ",
    "total_vaccinations": "Total Vaccinations",
}
# rename() is used in its non-inplace form, so df keeps its original labels.
df1 = df.rename(columns=column_titles).copy()
df1
Location | Vaccine | Total Vaccinations | |
---|---|---|---|
date | |||
2021-01-08 | Austria | Johnson&Johnson | 0 |
2021-01-08 | Austria | Moderna | 0 |
2021-01-08 | Austria | Oxford/AstraZeneca | 0 |
2021-01-08 | Austria | Pfizer/BioNTech | 31284 |
2021-01-15 | Austria | Johnson&Johnson | 0 |
... | ... | ... | ... |
2021-11-20 | European Union | Oxford/AstraZeneca | 67212673 |
2021-11-20 | European Union | Pfizer/BioNTech | 438725101 |
2021-11-20 | European Union | Sinopharm/Beijing | 2157500 |
2021-11-20 | European Union | Sinovac | 9 |
2021-11-20 | European Union | Sputnik V | 1845062 |
21897 rows × 3 columns
Convirtiendo a minúsculas los nombres de todas las columnas
# Apply str.lower to every column label; the cell values are not touched.
new_df = df1.rename(columns=str.lower)
new_df
location | vaccine | total vaccinations | |
---|---|---|---|
date | |||
2021-01-08 | Austria | Johnson&Johnson | 0 |
2021-01-08 | Austria | Moderna | 0 |
2021-01-08 | Austria | Oxford/AstraZeneca | 0 |
2021-01-08 | Austria | Pfizer/BioNTech | 31284 |
2021-01-15 | Austria | Johnson&Johnson | 0 |
... | ... | ... | ... |
2021-11-20 | European Union | Oxford/AstraZeneca | 67212673 |
2021-11-20 | European Union | Pfizer/BioNTech | 438725101 |
2021-11-20 | European Union | Sinopharm/Beijing | 2157500 |
2021-11-20 | European Union | Sinovac | 9 |
2021-11-20 | European Union | Sputnik V | 1845062 |
21897 rows × 3 columns
new_df.columns
Index(['location', ' vaccine ', 'total vaccinations'], dtype='object')
Eliminando los espacios iniciales y finales de los nombres de todas las columnas
# Apply str.strip to every column label, removing leading and trailing whitespace.
new_df = new_df.rename(columns=str.strip)
new_df
location | vaccine | total vaccinations | |
---|---|---|---|
date | |||
2021-01-08 | Austria | Johnson&Johnson | 0 |
2021-01-08 | Austria | Moderna | 0 |
2021-01-08 | Austria | Oxford/AstraZeneca | 0 |
2021-01-08 | Austria | Pfizer/BioNTech | 31284 |
2021-01-15 | Austria | Johnson&Johnson | 0 |
... | ... | ... | ... |
2021-11-20 | European Union | Oxford/AstraZeneca | 67212673 |
2021-11-20 | European Union | Pfizer/BioNTech | 438725101 |
2021-11-20 | European Union | Sinopharm/Beijing | 2157500 |
2021-11-20 | European Union | Sinovac | 9 |
2021-11-20 | European Union | Sputnik V | 1845062 |
21897 rows × 3 columns
Reinicializando el índice a valores enteros
new_df = new_df.reset_index()
new_df
date | location | vaccine | total vaccinations | |
---|---|---|---|---|
0 | 2021-01-08 | Austria | Johnson&Johnson | 0 |
1 | 2021-01-08 | Austria | Moderna | 0 |
2 | 2021-01-08 | Austria | Oxford/AstraZeneca | 0 |
3 | 2021-01-08 | Austria | Pfizer/BioNTech | 31284 |
4 | 2021-01-15 | Austria | Johnson&Johnson | 0 |
... | ... | ... | ... | ... |
21892 | 2021-11-20 | European Union | Oxford/AstraZeneca | 67212673 |
21893 | 2021-11-20 | European Union | Pfizer/BioNTech | 438725101 |
21894 | 2021-11-20 | European Union | Sinopharm/Beijing | 2157500 |
21895 | 2021-11-20 | European Union | Sinovac | 9 |
21896 | 2021-11-20 | European Union | Sputnik V | 1845062 |
21897 rows × 4 columns
Otra forma de generar el cambio
df1.columns
Index(['Location', ' Vaccine ', 'Total Vaccinations'], dtype='object')
# Same clean-up without rename(): lower-case every label, then trim the
# surrounding whitespace, and assign the resulting list back as the column labels.
cols = list(map(str.strip, map(str.lower, df1.columns)))
df1.columns = cols
df1.columns
Index(['location', 'vaccine', 'total vaccinations'], dtype='object')
Pascal Case notation¶
#https://www.kaggle.com/saliblue/country-vaccinations-by-manufacturer
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSHCOR8_Ha6TvBQwIcpjvJ0bzHYel1S8DXl4NHnMhVvdbibrgL_SP6rffuESpaJvPwLuUizXblQtHox/pub?output=csv"
df = pd.read_csv(url)
df
location | date | vaccine | total_vaccinations | |
---|---|---|---|---|
0 | Austria | 2021-01-08 | Johnson&Johnson | 0 |
1 | Austria | 2021-01-08 | Moderna | 0 |
2 | Austria | 2021-01-08 | Oxford/AstraZeneca | 0 |
3 | Austria | 2021-01-08 | Pfizer/BioNTech | 31284 |
4 | Austria | 2021-01-15 | Johnson&Johnson | 0 |
... | ... | ... | ... | ... |
21892 | European Union | 2021-11-20 | Oxford/AstraZeneca | 67212673 |
21893 | European Union | 2021-11-20 | Pfizer/BioNTech | 438725101 |
21894 | European Union | 2021-11-20 | Sinopharm/Beijing | 2157500 |
21895 | European Union | 2021-11-20 | Sinovac | 9 |
21896 | European Union | 2021-11-20 | Sputnik V | 1845062 |
21897 rows × 4 columns
#df.columns
# split() with no argument breaks on runs of whitespace, so the trailing space
# produces no empty word; each remaining word is then capitalised.
a = "hello world "
col = list(map(str.capitalize, a.split()))
col
['Hello', 'World']
df.columns
Index(['location', 'date', 'vaccine', 'total_vaccinations'], dtype='object')
df.columns = [c.replace("_"," ") for c in df.columns]
df.columns
Index(['location', 'date', 'vaccine', 'total vaccinations'], dtype='object')
Paso a paso hacia una expresión más compacta, ejemplo de PascalCase
a = [ cols for cols in df.columns ]
a
['location', 'date', 'vaccine', 'total vaccinations']
a = [ [c for c in cols.split()] for cols in df.columns ]
a
[['location'], ['date'], ['vaccine'], ['total', 'vaccinations']]
a = [ [c.capitalize() for c in cols.split()] for cols in df.columns ]
a
[['Location'], ['Date'], ['Vaccine'], ['Total', 'Vaccinations']]
a = ["adfads","Bsdfadf"]
# str.join glues the list items into one string, placing the separator (" ") between them.
" ".join(a)
'adfads Bsdfadf'
# PascalCase in a single expression: split each column name into words, capitalise
# every word, then join the words back together with no separator.
a =[ "".join([c.capitalize() for c in cols.split()]) for cols in df.columns ]
a
['Location', 'Date', 'Vaccine', 'TotalVaccinations']
cols=a
df.columns=cols
df
Location | Date | Vaccine | TotalVaccinations | |
---|---|---|---|---|
0 | Austria | 2021-01-08 | Johnson&Johnson | 0 |
1 | Austria | 2021-01-08 | Moderna | 0 |
2 | Austria | 2021-01-08 | Oxford/AstraZeneca | 0 |
3 | Austria | 2021-01-08 | Pfizer/BioNTech | 31284 |
4 | Austria | 2021-01-15 | Johnson&Johnson | 0 |
... | ... | ... | ... | ... |
21892 | European Union | 2021-11-20 | Oxford/AstraZeneca | 67212673 |
21893 | European Union | 2021-11-20 | Pfizer/BioNTech | 438725101 |
21894 | European Union | 2021-11-20 | Sinopharm/Beijing | 2157500 |
21895 | European Union | 2021-11-20 | Sinovac | 9 |
21896 | European Union | 2021-11-20 | Sputnik V | 1845062 |
21897 rows × 4 columns
Máscaras en columnas¶
# Boolean Series: True where TotalVaccinations is strictly greater than 2157500.
tf = df["TotalVaccinations"].gt(2157500)
# Mask selection: returns a new DataFrame holding only the True rows,
# so its number of rows differs from df.
df.loc[tf]
Location | Date | Vaccine | TotalVaccinations | |
---|---|---|---|---|
67 | Austria | 2021-04-30 | Pfizer/BioNTech | 2273457 |
71 | Austria | 2021-05-07 | Pfizer/BioNTech | 2604643 |
75 | Austria | 2021-05-14 | Pfizer/BioNTech | 2904840 |
79 | Austria | 2021-05-21 | Pfizer/BioNTech | 3283752 |
83 | Austria | 2021-05-28 | Pfizer/BioNTech | 3620298 |
... | ... | ... | ... | ... |
21886 | European Union | 2021-11-19 | Pfizer/BioNTech | 438577477 |
21890 | European Union | 2021-11-20 | Johnson&Johnson | 16950779 |
21891 | European Union | 2021-11-20 | Moderna | 61206560 |
21892 | European Union | 2021-11-20 | Oxford/AstraZeneca | 67212673 |
21893 | European Union | 2021-11-20 | Pfizer/BioNTech | 438725101 |
7626 rows × 4 columns
df.where(tf) # keeps every row of df but puts NaN in all columns where the condition is not met
Location | Date | Vaccine | TotalVaccinations | |
---|---|---|---|---|
0 | NaN | NaN | NaN | NaN |
1 | NaN | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN |
3 | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... |
21892 | European Union | 2021-11-20 | Oxford/AstraZeneca | 67212673.0 |
21893 | European Union | 2021-11-20 | Pfizer/BioNTech | 438725101.0 |
21894 | NaN | NaN | NaN | NaN |
21895 | NaN | NaN | NaN | NaN |
21896 | NaN | NaN | NaN | NaN |
21897 rows × 4 columns
n_df = df.where(tf).copy()
n_df.dropna() # dropping the NaN rows gives back the same rows that the mask selection df[tf] returns
Location | Date | Vaccine | TotalVaccinations | |
---|---|---|---|---|
67 | Austria | 2021-04-30 | Pfizer/BioNTech | 2273457.0 |
71 | Austria | 2021-05-07 | Pfizer/BioNTech | 2604643.0 |
75 | Austria | 2021-05-14 | Pfizer/BioNTech | 2904840.0 |
79 | Austria | 2021-05-21 | Pfizer/BioNTech | 3283752.0 |
83 | Austria | 2021-05-28 | Pfizer/BioNTech | 3620298.0 |
... | ... | ... | ... | ... |
21886 | European Union | 2021-11-19 | Pfizer/BioNTech | 438577477.0 |
21890 | European Union | 2021-11-20 | Johnson&Johnson | 16950779.0 |
21891 | European Union | 2021-11-20 | Moderna | 61206560.0 |
21892 | European Union | 2021-11-20 | Oxford/AstraZeneca | 67212673.0 |
21893 | European Union | 2021-11-20 | Pfizer/BioNTech | 438725101.0 |
7626 rows × 4 columns
Volviendo a la máscara
df = df[tf]
df
Location | Date | Vaccine | TotalVaccinations | |
---|---|---|---|---|
67 | Austria | 2021-04-30 | Pfizer/BioNTech | 2273457 |
71 | Austria | 2021-05-07 | Pfizer/BioNTech | 2604643 |
75 | Austria | 2021-05-14 | Pfizer/BioNTech | 2904840 |
79 | Austria | 2021-05-21 | Pfizer/BioNTech | 3283752 |
83 | Austria | 2021-05-28 | Pfizer/BioNTech | 3620298 |
... | ... | ... | ... | ... |
21886 | European Union | 2021-11-19 | Pfizer/BioNTech | 438577477 |
21890 | European Union | 2021-11-20 | Johnson&Johnson | 16950779 |
21891 | European Union | 2021-11-20 | Moderna | 61206560 |
21892 | European Union | 2021-11-20 | Oxford/AstraZeneca | 67212673 |
21893 | European Union | 2021-11-20 | Pfizer/BioNTech | 438725101 |
7626 rows × 4 columns
Comparación para valores de una misma columna
# Keep the rows whose total lies strictly between the two bounds (both ends excluded).
above_lower = df["TotalVaccinations"] > 2273457
below_upper = df["TotalVaccinations"] < 61206560
df[above_lower & below_upper]
Location | Date | Vaccine | TotalVaccinations | |
---|---|---|---|---|
71 | Austria | 2021-05-07 | Pfizer/BioNTech | 2604643 |
75 | Austria | 2021-05-14 | Pfizer/BioNTech | 2904840 |
79 | Austria | 2021-05-21 | Pfizer/BioNTech | 3283752 |
83 | Austria | 2021-05-28 | Pfizer/BioNTech | 3620298 |
87 | Austria | 2021-06-04 | Pfizer/BioNTech | 4047114 |
... | ... | ... | ... | ... |
21876 | European Union | 2021-11-18 | Johnson&Johnson | 16867390 |
21877 | European Union | 2021-11-18 | Moderna | 60957149 |
21883 | European Union | 2021-11-19 | Johnson&Johnson | 16948949 |
21884 | European Union | 2021-11-19 | Moderna | 61165719 |
21890 | European Union | 2021-11-20 | Johnson&Johnson | 16950779 |
6348 rows × 4 columns
df.reset_index()
index | Location | Date | Vaccine | TotalVaccinations | |
---|---|---|---|---|---|
0 | 67 | Austria | 2021-04-30 | Pfizer/BioNTech | 2273457 |
1 | 71 | Austria | 2021-05-07 | Pfizer/BioNTech | 2604643 |
2 | 75 | Austria | 2021-05-14 | Pfizer/BioNTech | 2904840 |
3 | 79 | Austria | 2021-05-21 | Pfizer/BioNTech | 3283752 |
4 | 83 | Austria | 2021-05-28 | Pfizer/BioNTech | 3620298 |
... | ... | ... | ... | ... | ... |
7621 | 21886 | European Union | 2021-11-19 | Pfizer/BioNTech | 438577477 |
7622 | 21890 | European Union | 2021-11-20 | Johnson&Johnson | 16950779 |
7623 | 21891 | European Union | 2021-11-20 | Moderna | 61206560 |
7624 | 21892 | European Union | 2021-11-20 | Oxford/AstraZeneca | 67212673 |
7625 | 21893 | European Union | 2021-11-20 | Pfizer/BioNTech | 438725101 |
7626 rows × 5 columns
q=df.set_index("Location")
q.reset_index()
Location | Date | Vaccine | TotalVaccinations | |
---|---|---|---|---|
0 | Austria | 2021-04-30 | Pfizer/BioNTech | 2273457 |
1 | Austria | 2021-05-07 | Pfizer/BioNTech | 2604643 |
2 | Austria | 2021-05-14 | Pfizer/BioNTech | 2904840 |
3 | Austria | 2021-05-21 | Pfizer/BioNTech | 3283752 |
4 | Austria | 2021-05-28 | Pfizer/BioNTech | 3620298 |
... | ... | ... | ... | ... |
7621 | European Union | 2021-11-19 | Pfizer/BioNTech | 438577477 |
7622 | European Union | 2021-11-20 | Johnson&Johnson | 16950779 |
7623 | European Union | 2021-11-20 | Moderna | 61206560 |
7624 | European Union | 2021-11-20 | Oxford/AstraZeneca | 67212673 |
7625 | European Union | 2021-11-20 | Pfizer/BioNTech | 438725101 |
7626 rows × 4 columns
unique()
df.Location.unique()
array(['Austria', 'Belgium', 'Chile', 'Croatia', 'Czechia', 'Denmark',
'Ecuador', 'Finland', 'France', 'Germany', 'Hong Kong', 'Hungary',
'Ireland', 'Italy', 'Japan', 'Lithuania', 'Netherlands', 'Norway',
'Poland', 'Portugal', 'Romania', 'Slovakia', 'South Korea',
'Spain', 'Sweden', 'Switzerland', 'Ukraine', 'United States',
'Uruguay', 'European Union'], dtype=object)
df.Vaccine.unique()
array(['Pfizer/BioNTech', 'Oxford/AstraZeneca', 'Sinovac', 'Moderna',
'Johnson&Johnson'], dtype=object)
Definir columnas
cols=['Location', 'Date', 'Vaccine']
df[cols]
Location | Date | Vaccine | |
---|---|---|---|
67 | Austria | 2021-04-30 | Pfizer/BioNTech |
71 | Austria | 2021-05-07 | Pfizer/BioNTech |
75 | Austria | 2021-05-14 | Pfizer/BioNTech |
79 | Austria | 2021-05-21 | Pfizer/BioNTech |
83 | Austria | 2021-05-28 | Pfizer/BioNTech |
... | ... | ... | ... |
21886 | European Union | 2021-11-19 | Pfizer/BioNTech |
21890 | European Union | 2021-11-20 | Johnson&Johnson |
21891 | European Union | 2021-11-20 | Moderna |
21892 | European Union | 2021-11-20 | Oxford/AstraZeneca |
21893 | European Union | 2021-11-20 | Pfizer/BioNTech |
7626 rows × 3 columns
g=df.set_index(['Location','Vaccine'])
g
Date | TotalVaccinations | ||
---|---|---|---|
Location | Vaccine | ||
Austria | Pfizer/BioNTech | 2021-04-30 | 2273457 |
Pfizer/BioNTech | 2021-05-07 | 2604643 | |
Pfizer/BioNTech | 2021-05-14 | 2904840 | |
Pfizer/BioNTech | 2021-05-21 | 3283752 | |
Pfizer/BioNTech | 2021-05-28 | 3620298 | |
... | ... | ... | ... |
European Union | Pfizer/BioNTech | 2021-11-19 | 438577477 |
Johnson&Johnson | 2021-11-20 | 16950779 | |
Moderna | 2021-11-20 | 61206560 | |
Oxford/AstraZeneca | 2021-11-20 | 67212673 | |
Pfizer/BioNTech | 2021-11-20 | 438725101 |
7626 rows × 2 columns
g.loc["Austria"]
Date | TotalVaccinations | |
---|---|---|
Vaccine | ||
Pfizer/BioNTech | 2021-04-30 | 2273457 |
Pfizer/BioNTech | 2021-05-07 | 2604643 |
Pfizer/BioNTech | 2021-05-14 | 2904840 |
Pfizer/BioNTech | 2021-05-21 | 3283752 |
Pfizer/BioNTech | 2021-05-28 | 3620298 |
Pfizer/BioNTech | 2021-06-04 | 4047114 |
Pfizer/BioNTech | 2021-06-11 | 4487274 |
Pfizer/BioNTech | 2021-06-18 | 4944088 |
Pfizer/BioNTech | 2021-06-25 | 5391151 |
Pfizer/BioNTech | 2021-07-02 | 5784489 |
Pfizer/BioNTech | 2021-07-09 | 6116071 |
Pfizer/BioNTech | 2021-07-16 | 6392275 |
Pfizer/BioNTech | 2021-07-23 | 6644089 |
Pfizer/BioNTech | 2021-07-30 | 6856749 |
Pfizer/BioNTech | 2021-08-06 | 7035563 |
Pfizer/BioNTech | 2021-08-13 | 7161389 |
Pfizer/BioNTech | 2021-08-20 | 7255573 |
Pfizer/BioNTech | 2021-08-27 | 7314987 |
Pfizer/BioNTech | 2021-09-03 | 7367498 |
Pfizer/BioNTech | 2021-09-10 | 7423706 |
Pfizer/BioNTech | 2021-09-17 | 7495432 |
Pfizer/BioNTech | 2021-09-24 | 7570056 |
Pfizer/BioNTech | 2021-10-01 | 7652494 |
Pfizer/BioNTech | 2021-10-08 | 7765350 |
Pfizer/BioNTech | 2021-10-15 | 7873668 |
Pfizer/BioNTech | 2021-10-22 | 8000912 |
Pfizer/BioNTech | 2021-10-29 | 8117682 |
Pfizer/BioNTech | 2021-11-05 | 8336536 |
Pfizer/BioNTech | 2021-11-12 | 8758674 |
Pfizer/BioNTech | 2021-11-19 | 9284152 |
Operación groupby
df.groupby(["Location"])
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fdb029cb790>
df.groupby(["Location"]).count()
Date | Vaccine | TotalVaccinations | |
---|---|---|---|
Location | |||
Austria | 30 | 30 | 30 |
Belgium | 51 | 51 | 51 |
Chile | 476 | 476 | 476 |
Croatia | 18 | 18 | 18 |
Czechia | 212 | 212 | 212 |
Denmark | 27 | 27 | 27 |
Ecuador | 287 | 287 | 287 |
European Union | 1019 | 1019 | 1019 |
Finland | 26 | 26 | 26 |
France | 720 | 720 | 720 |
Germany | 867 | 867 | 867 |
Hong Kong | 260 | 260 | 260 |
Hungary | 31 | 31 | 31 |
Ireland | 25 | 25 | 25 |
Italy | 711 | 711 | 711 |
Japan | 135 | 135 | 135 |
Lithuania | 3 | 3 | 3 |
Netherlands | 57 | 57 | 57 |
Norway | 25 | 25 | 25 |
Poland | 98 | 98 | 98 |
Portugal | 45 | 45 | 45 |
Romania | 240 | 240 | 240 |
Slovakia | 23 | 23 | 23 |
South Korea | 472 | 472 | 472 |
Spain | 100 | 100 | 100 |
Sweden | 31 | 31 | 31 |
Switzerland | 358 | 358 | 358 |
Ukraine | 264 | 264 | 264 |
United States | 841 | 841 | 841 |
Uruguay | 174 | 174 | 174 |
df.groupby(["Vaccine"]).count()
Location | Date | TotalVaccinations | |
---|---|---|---|
Vaccine | |||
Johnson&Johnson | 552 | 552 | 552 |
Moderna | 1499 | 1499 | 1499 |
Oxford/AstraZeneca | 1383 | 1383 | 1383 |
Pfizer/BioNTech | 3410 | 3410 | 3410 |
Sinovac | 782 | 782 | 782 |
# Rows for Austria only, renumbered from 0; reset_index() without drop keeps
# the previous row labels in an "index" column.
part_df = df.loc[df["Location"] == "Austria"].reset_index()
# True where Location is missing.
mask = df["Location"].isna()
# fillna returns a new DataFrame with missing values replaced by 0; the result is
# only displayed here, so df itself is not modified.
df.fillna(0)
Location | Date | Vaccine | TotalVaccinations | |
---|---|---|---|---|
67 | Austria | 2021-04-30 | Pfizer/BioNTech | 2273457 |
71 | Austria | 2021-05-07 | Pfizer/BioNTech | 2604643 |
75 | Austria | 2021-05-14 | Pfizer/BioNTech | 2904840 |
79 | Austria | 2021-05-21 | Pfizer/BioNTech | 3283752 |
83 | Austria | 2021-05-28 | Pfizer/BioNTech | 3620298 |
... | ... | ... | ... | ... |
21886 | European Union | 2021-11-19 | Pfizer/BioNTech | 438577477 |
21890 | European Union | 2021-11-20 | Johnson&Johnson | 16950779 |
21891 | European Union | 2021-11-20 | Moderna | 61206560 |
21892 | European Union | 2021-11-20 | Oxford/AstraZeneca | 67212673 |
21893 | European Union | 2021-11-20 | Pfizer/BioNTech | 438725101 |
7626 rows × 4 columns
Series de tiempo https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv
Algunos repositorios y páginas de interés
https://www.nature.com/sdata/policies/repositories
https://paperswithcode.com/
https://towardsdatascience.com/31-datasets-for-your-next-data-science-project-6ef9a6f8cac6
https://www.data.gov/
https://archive.ics.uci.edu/ml/index.php
https://data.world/datasets/geodata
https://matmatch.com/advanced-search?categories=ceramic
https://github.com/sedaoturak/data-resources-for-materials-science
https://guides.library.cmu.edu/machine-learning/datasets