Construyendo Data Frames desde diccionarios y cargando datos en un data frame

Open In Colab

Construyendo Data Frames desde diccionarios y cargando datos en un data frame

import numpy as np
import pandas as pd
import os
# Two identical grids: 10 evenly spaced samples covering [0, 10].
x = np.linspace(start=0, stop=10, num=10)
y = np.linspace(start=0, stop=10, num=10)

# Each dictionary key becomes a column label of the DataFrame.
d = dict(x=x, y=y)
df = pd.DataFrame(data=d)
df["y"]  # bracket lookup; same Series as attribute access
0     0.000000
1     1.111111
2     2.222222
3     3.333333
4     4.444444
5     5.555556
6     6.666667
7     7.777778
8     8.888889
9    10.000000
Name: y, dtype: float64
# Dataset reference:
# https://www.kaggle.com/gpreda/covid-world-vaccination-progress?select=country_vaccinations
path = "https://github.com/hernansalinas/Curso_aprendizaje_estadistico/blob/main/datasets/sesion_01b_country_vaccinations.xlsx?raw=true"
# Reading .xlsx files needs the optional `openpyxl` package; without it
# pandas raises ImportError ("Missing optional dependency 'openpyxl'").
# Install it with pip or conda before running this cell.
df = pd.read_excel(path)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~\anaconda3\envs\book\lib\site-packages\pandas\compat\_optional.py:126, in import_optional_dependency(name, extra, errors, min_version)
    125 try:
--> 126     module = importlib.import_module(name)
    127 except ImportError:

File ~\anaconda3\envs\book\lib\importlib\__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1050, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1027, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1004, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'openpyxl'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Input In [5], in <cell line: 4>()
      2 # https://www.kaggle.com/gpreda/covid-world-vaccination-progress?select=country_vaccinations
      3 path="https://github.com/hernansalinas/Curso_aprendizaje_estadistico/blob/main/datasets/sesion_01b_country_vaccinations.xlsx?raw=true"
----> 4 df = pd.read_excel(f"{path}")

File ~\anaconda3\envs\book\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    305 if len(args) > num_allow_args:
    306     warnings.warn(
    307         msg.format(arguments=arguments),
    308         FutureWarning,
    309         stacklevel=stacklevel,
    310     )
--> 311 return func(*args, **kwargs)

File ~\anaconda3\envs\book\lib\site-packages\pandas\io\excel\_base.py:457, in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, decimal, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
    455 if not isinstance(io, ExcelFile):
    456     should_close = True
--> 457     io = ExcelFile(io, storage_options=storage_options, engine=engine)
    458 elif engine and engine != io.engine:
    459     raise ValueError(
    460         "Engine should not be specified when passing "
    461         "an ExcelFile - ExcelFile already has the engine set"
    462     )

File ~\anaconda3\envs\book\lib\site-packages\pandas\io\excel\_base.py:1419, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options)
   1416 self.engine = engine
   1417 self.storage_options = storage_options
-> 1419 self._reader = self._engines[engine](self._io, storage_options=storage_options)

File ~\anaconda3\envs\book\lib\site-packages\pandas\io\excel\_openpyxl.py:524, in OpenpyxlReader.__init__(self, filepath_or_buffer, storage_options)
    509 def __init__(
    510     self,
    511     filepath_or_buffer: FilePath | ReadBuffer[bytes],
    512     storage_options: StorageOptions = None,
    513 ) -> None:
    514     """
    515     Reader using openpyxl engine.
    516 
   (...)
    522         passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``)
    523     """
--> 524     import_optional_dependency("openpyxl")
    525     super().__init__(filepath_or_buffer, storage_options=storage_options)

File ~\anaconda3\envs\book\lib\site-packages\pandas\compat\_optional.py:129, in import_optional_dependency(name, extra, errors, min_version)
    127 except ImportError:
    128     if errors == "raise":
--> 129         raise ImportError(msg)
    130     else:
    131         return None

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
# Preview the first rows of the loaded table (head() shows 5 by default).
df.head()
country iso_code date total_vaccinations people_vaccinated people_fully_vaccinated daily_vaccinations_raw daily_vaccinations total_vaccinations_per_hundred people_vaccinated_per_hundred people_fully_vaccinated_per_hundred daily_vaccinations_per_million vaccines source_name source_website
0 Afghanistan AFG 2021-02-22 0.0 0.0 NaN NaN NaN 0.0 0.0 NaN NaN Oxford/AstraZeneca Government of Afghanistan http://www.xinhuanet.com/english/asiapacific/2...
1 Afghanistan AFG 2021-02-23 NaN NaN NaN NaN 1367.0 NaN NaN NaN 35.0 Oxford/AstraZeneca Government of Afghanistan http://www.xinhuanet.com/english/asiapacific/2...
2 Afghanistan AFG 2021-02-24 NaN NaN NaN NaN 1367.0 NaN NaN NaN 35.0 Oxford/AstraZeneca Government of Afghanistan http://www.xinhuanet.com/english/asiapacific/2...
3 Afghanistan AFG 2021-02-25 NaN NaN NaN NaN 1367.0 NaN NaN NaN 35.0 Oxford/AstraZeneca Government of Afghanistan http://www.xinhuanet.com/english/asiapacific/2...
4 Afghanistan AFG 2021-02-26 NaN NaN NaN NaN 1367.0 NaN NaN NaN 35.0 Oxford/AstraZeneca Government of Afghanistan http://www.xinhuanet.com/english/asiapacific/2...
 # Reference links left by the author:
 #   https://www.kaggle.com/sakhawat18/asteroid-dataset
 #   https://ssd.jpl.nasa.gov/tools/sbdb_query.html
 # NOTE(review): both links point to asteroid data, while the table loaded
 # below has stellar columns (temperature, luminosity, star type) - confirm source.
 path_git = "https://raw.githubusercontent.com/hernansalinas/Curso_aprendizaje_estadistico/main/datasets/sesion_01b_dataset.csv"
 df = pd.read_csv(path_git)
 df
Temperature (K) Luminosity(L/Lo) Radius(R/Ro) Absolute magnitude(Mv) Star type Star color Spectral Class
0 3068 0.002400 0.1700 16.12 0 Red M
1 3042 0.000500 0.1542 16.60 0 Red M
2 2600 0.000300 0.1020 18.70 0 Red M
3 2800 0.000200 0.1600 16.65 0 Red M
4 1939 0.000138 0.1030 20.06 0 Red M
... ... ... ... ... ... ... ...
235 38940 374830.000000 1356.0000 -9.93 5 Blue O
236 30839 834042.000000 1194.0000 -10.63 5 Blue O
237 8829 537493.000000 1423.0000 -10.73 5 White A
238 9235 404940.000000 1112.0000 -11.23 5 White A
239 37882 294903.000000 1783.0000 -7.80 5 Blue O

240 rows × 7 columns

Lectura de un dataset ubicado en Google Drive

# Published Google Sheets document, requested as CSV (note `output=csv` in the URL).
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSHCOR8_Ha6TvBQwIcpjvJ0bzHYel1S8DXl4NHnMhVvdbibrgL_SP6rffuESpaJvPwLuUizXblQtHox/pub?output=csv"
# read_csv accepts a URL directly; the rows get a default 0..n-1 integer index.
df = pd.read_csv(url)
df
location date vaccine total_vaccinations
0 Austria 2021-01-08 Johnson&Johnson 0
1 Austria 2021-01-08 Moderna 0
2 Austria 2021-01-08 Oxford/AstraZeneca 0
3 Austria 2021-01-08 Pfizer/BioNTech 31284
4 Austria 2021-01-15 Johnson&Johnson 0
... ... ... ... ...
21892 European Union 2021-11-20 Oxford/AstraZeneca 67212673
21893 European Union 2021-11-20 Pfizer/BioNTech 438725101
21894 European Union 2021-11-20 Sinopharm/Beijing 2157500
21895 European Union 2021-11-20 Sinovac 9
21896 European Union 2021-11-20 Sputnik V 1845062

21897 rows × 4 columns

Asignación de la columna date como índice (index)

# Re-read the same sheet, this time using the "date" column as the row index.
df = pd.read_csv(url, index_col="date")
df
location vaccine total_vaccinations
date
2021-01-08 Austria Johnson&Johnson 0
2021-01-08 Austria Moderna 0
2021-01-08 Austria Oxford/AstraZeneca 0
2021-01-08 Austria Pfizer/BioNTech 31284
2021-01-15 Austria Johnson&Johnson 0
... ... ... ...
2021-11-20 European Union Oxford/AstraZeneca 67212673
2021-11-20 European Union Pfizer/BioNTech 438725101
2021-11-20 European Union Sinopharm/Beijing 2157500
2021-11-20 European Union Sinovac 9
2021-11-20 European Union Sputnik V 1845062

21897 rows × 3 columns

Renombrar columnas

# rename() returns a new DataFrame and leaves df untouched, so the result
# can be bound directly; an extra .copy() would only duplicate the data again.
# Note the spaces around " Vaccine " in the new label.
df1 = df.rename(columns={"location": "Location",
                         "vaccine": " Vaccine ",
                         "total_vaccinations": "Total Vaccinations"})
df1
Location Vaccine Total Vaccinations
date
2021-01-08 Austria Johnson&Johnson 0
2021-01-08 Austria Moderna 0
2021-01-08 Austria Oxford/AstraZeneca 0
2021-01-08 Austria Pfizer/BioNTech 31284
2021-01-15 Austria Johnson&Johnson 0
... ... ... ...
2021-11-20 European Union Oxford/AstraZeneca 67212673
2021-11-20 European Union Pfizer/BioNTech 438725101
2021-11-20 European Union Sinopharm/Beijing 2157500
2021-11-20 European Union Sinovac 9
2021-11-20 European Union Sputnik V 1845062

21897 rows × 3 columns

Convirtiendo a minúsculas los nombres de todas las columnas

# Apply str.lower to every column label; df1 itself is not modified.
new_df = df1.rename(columns=str.lower)
new_df
location vaccine total vaccinations
date
2021-01-08 Austria Johnson&Johnson 0
2021-01-08 Austria Moderna 0
2021-01-08 Austria Oxford/AstraZeneca 0
2021-01-08 Austria Pfizer/BioNTech 31284
2021-01-15 Austria Johnson&Johnson 0
... ... ... ...
2021-11-20 European Union Oxford/AstraZeneca 67212673
2021-11-20 European Union Pfizer/BioNTech 438725101
2021-11-20 European Union Sinopharm/Beijing 2157500
2021-11-20 European Union Sinovac 9
2021-11-20 European Union Sputnik V 1845062

21897 rows × 3 columns

# Inspect the labels: ' vaccine ' still carries its surrounding spaces.
new_df.columns
Index(['location', ' vaccine ', 'total vaccinations'], dtype='object')

Eliminando los espacios iniciales y finales de los nombres de todas las columnas

# str.strip trims whitespace at both ends of every column label.
new_df = new_df.rename(columns=str.strip)
new_df
location vaccine total vaccinations
date
2021-01-08 Austria Johnson&Johnson 0
2021-01-08 Austria Moderna 0
2021-01-08 Austria Oxford/AstraZeneca 0
2021-01-08 Austria Pfizer/BioNTech 31284
2021-01-15 Austria Johnson&Johnson 0
... ... ... ...
2021-11-20 European Union Oxford/AstraZeneca 67212673
2021-11-20 European Union Pfizer/BioNTech 438725101
2021-11-20 European Union Sinopharm/Beijing 2157500
2021-11-20 European Union Sinovac 9
2021-11-20 European Union Sputnik V 1845062

21897 rows × 3 columns

Reinicializando el índice a valores enteros

# reset_index() moves the "date" index back into an ordinary column and
# gives the rows a fresh 0..n-1 integer index.
new_df = new_df.reset_index()
new_df
date location vaccine total vaccinations
0 2021-01-08 Austria Johnson&Johnson 0
1 2021-01-08 Austria Moderna 0
2 2021-01-08 Austria Oxford/AstraZeneca 0
3 2021-01-08 Austria Pfizer/BioNTech 31284
4 2021-01-15 Austria Johnson&Johnson 0
... ... ... ... ...
21892 2021-11-20 European Union Oxford/AstraZeneca 67212673
21893 2021-11-20 European Union Pfizer/BioNTech 438725101
21894 2021-11-20 European Union Sinopharm/Beijing 2157500
21895 2021-11-20 European Union Sinovac 9
21896 2021-11-20 European Union Sputnik V 1845062

21897 rows × 4 columns

Otra forma de generar el cambio

# Labels before normalisation: mixed case, with spaces around ' Vaccine '.
df1.columns
Index(['Location', ' Vaccine ', 'Total Vaccinations'], dtype='object')
# Same clean-up as the two rename() calls, done in a single pass with a
# list comprehension: lowercase each label, then trim its whitespace.
cols = [label.lower().strip() for label in df1.columns]
# Assigning to .columns relabels df1 in place.
df1.columns = cols
df1.columns
Index(['location', 'vaccine', 'total vaccinations'], dtype='object')

img

Pascal Case notation

# Dataset reference: https://www.kaggle.com/saliblue/country-vaccinations-by-manufacturer
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSHCOR8_Ha6TvBQwIcpjvJ0bzHYel1S8DXl4NHnMhVvdbibrgL_SP6rffuESpaJvPwLuUizXblQtHox/pub?output=csv"
# Reload the sheet so df has its original labels and default integer index again.
df = pd.read_csv(url)
df
location date vaccine total_vaccinations
0 Austria 2021-01-08 Johnson&Johnson 0
1 Austria 2021-01-08 Moderna 0
2 Austria 2021-01-08 Oxford/AstraZeneca 0
3 Austria 2021-01-08 Pfizer/BioNTech 31284
4 Austria 2021-01-15 Johnson&Johnson 0
... ... ... ... ...
21892 European Union 2021-11-20 Oxford/AstraZeneca 67212673
21893 European Union 2021-11-20 Pfizer/BioNTech 438725101
21894 European Union 2021-11-20 Sinopharm/Beijing 2157500
21895 European Union 2021-11-20 Sinovac 9
21896 European Union 2021-11-20 Sputnik V 1845062

21897 rows × 4 columns

# Warm-up on a plain string: split() breaks it on whitespace (the trailing
# space is ignored) and each word gets its first letter upper-cased.
a = "hello world "
col = list(map(str.capitalize, a.split()))
col
['Hello', 'World']
# Current labels: lower-case, with an underscore in 'total_vaccinations'.
df.columns
Index(['location', 'date', 'vaccine', 'total_vaccinations'], dtype='object')
# Swap underscores for spaces in every label; regex=False makes "_" a
# literal substring rather than a pattern.
df.columns = df.columns.str.replace("_", " ", regex=False)
df.columns
Index(['location', 'date', 'vaccine', 'total vaccinations'], dtype='object')

Paso a paso hacia una expresión más compacta; ejemplo de PascalCase

# Step 1: collect the column labels into a plain list.
a = [ cols  for cols in df.columns ]
a
['location', 'date', 'vaccine', 'total vaccinations']
# Step 2: split every label on whitespace, giving one list of words per label.
a = [ [c  for c in cols.split()]   for cols in df.columns ]
a
[['location'], ['date'], ['vaccine'], ['total', 'vaccinations']]
# Step 3: same nested comprehension, now capitalizing each word.
a = [ [c.capitalize()  for c in cols.split()]   for cols in df.columns ]
a
[['Location'], ['Date'], ['Vaccine'], ['Total', 'Vaccinations']]
# Aside: str.join glues the items of a list together with the chosen separator.
a = ["adfads", "Bsdfadf"]
separator = " "
separator.join(a)
'adfads Bsdfadf'
# Step 4: join the capitalized words of each label with an empty separator,
# which yields PascalCase names ('total vaccinations' -> 'TotalVaccinations').
a =[ "".join([c.capitalize() for c in cols.split()])  for cols in df.columns ]
a
['Location', 'Date', 'Vaccine', 'TotalVaccinations']
# Install the PascalCase labels as the DataFrame's column names.
cols=a
df.columns=cols
df
Location Date Vaccine TotalVaccinations
0 Austria 2021-01-08 Johnson&Johnson 0
1 Austria 2021-01-08 Moderna 0
2 Austria 2021-01-08 Oxford/AstraZeneca 0
3 Austria 2021-01-08 Pfizer/BioNTech 31284
4 Austria 2021-01-15 Johnson&Johnson 0
... ... ... ... ...
21892 European Union 2021-11-20 Oxford/AstraZeneca 67212673
21893 European Union 2021-11-20 Pfizer/BioNTech 438725101
21894 European Union 2021-11-20 Sinopharm/Beijing 2157500
21895 European Union 2021-11-20 Sinovac 9
21896 European Union 2021-11-20 Sputnik V 1845062

21897 rows × 4 columns

Máscaras en columnas

# Boolean Series: True for the rows whose total exceeds 2157500.
tf = df.TotalVaccinations > 2157500
# Boolean mask: only the True rows are kept, so the resulting frame has a
# different number of rows.
df.loc[tf]
Location Date Vaccine TotalVaccinations
67 Austria 2021-04-30 Pfizer/BioNTech 2273457
71 Austria 2021-05-07 Pfizer/BioNTech 2604643
75 Austria 2021-05-14 Pfizer/BioNTech 2904840
79 Austria 2021-05-21 Pfizer/BioNTech 3283752
83 Austria 2021-05-28 Pfizer/BioNTech 3620298
... ... ... ... ...
21886 European Union 2021-11-19 Pfizer/BioNTech 438577477
21890 European Union 2021-11-20 Johnson&Johnson 16950779
21891 European Union 2021-11-20 Moderna 61206560
21892 European Union 2021-11-20 Oxford/AstraZeneca 67212673
21893 European Union 2021-11-20 Pfizer/BioNTech 438725101

7626 rows × 4 columns

df.where(tf)  # keeps every row but sets all cells to NaN where the condition is not met
Location Date Vaccine TotalVaccinations
0 NaN NaN NaN NaN
1 NaN NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
... ... ... ... ...
21892 European Union 2021-11-20 Oxford/AstraZeneca 67212673.0
21893 European Union 2021-11-20 Pfizer/BioNTech 438725101.0
21894 NaN NaN NaN NaN
21895 NaN NaN NaN NaN
21896 NaN NaN NaN NaN

21897 rows × 4 columns

# where() already returns a new DataFrame, so no extra .copy() is needed.
n_df = df.where(tf)
# Dropping the NaN rows gives back the rows selected by the mask.
# TotalVaccinations is now shown as float because it had to hold NaN.
n_df.dropna()
Location Date Vaccine TotalVaccinations
67 Austria 2021-04-30 Pfizer/BioNTech 2273457.0
71 Austria 2021-05-07 Pfizer/BioNTech 2604643.0
75 Austria 2021-05-14 Pfizer/BioNTech 2904840.0
79 Austria 2021-05-21 Pfizer/BioNTech 3283752.0
83 Austria 2021-05-28 Pfizer/BioNTech 3620298.0
... ... ... ... ...
21886 European Union 2021-11-19 Pfizer/BioNTech 438577477.0
21890 European Union 2021-11-20 Johnson&Johnson 16950779.0
21891 European Union 2021-11-20 Moderna 61206560.0
21892 European Union 2021-11-20 Oxford/AstraZeneca 67212673.0
21893 European Union 2021-11-20 Pfizer/BioNTech 438725101.0

7626 rows × 4 columns

Volviendo a la máscara

# From here on df holds only the masked rows; their original index labels
# (67, 71, ...) are preserved.
df = df[tf]
df
Location Date Vaccine TotalVaccinations
67 Austria 2021-04-30 Pfizer/BioNTech 2273457
71 Austria 2021-05-07 Pfizer/BioNTech 2604643
75 Austria 2021-05-14 Pfizer/BioNTech 2904840
79 Austria 2021-05-21 Pfizer/BioNTech 3283752
83 Austria 2021-05-28 Pfizer/BioNTech 3620298
... ... ... ... ...
21886 European Union 2021-11-19 Pfizer/BioNTech 438577477
21890 European Union 2021-11-20 Johnson&Johnson 16950779
21891 European Union 2021-11-20 Moderna 61206560
21892 European Union 2021-11-20 Oxford/AstraZeneca 67212673
21893 European Union 2021-11-20 Pfizer/BioNTech 438725101

7626 rows × 4 columns

Comparación de valores de una misma columna

# Two strict comparisons on the same column, combined element-wise with &;
# both bounds are excluded from the selection.
above_lower = df["TotalVaccinations"] > 2273457
below_upper = df["TotalVaccinations"] < 61206560
df[above_lower & below_upper]
Location Date Vaccine TotalVaccinations
71 Austria 2021-05-07 Pfizer/BioNTech 2604643
75 Austria 2021-05-14 Pfizer/BioNTech 2904840
79 Austria 2021-05-21 Pfizer/BioNTech 3283752
83 Austria 2021-05-28 Pfizer/BioNTech 3620298
87 Austria 2021-06-04 Pfizer/BioNTech 4047114
... ... ... ... ...
21876 European Union 2021-11-18 Johnson&Johnson 16867390
21877 European Union 2021-11-18 Moderna 60957149
21883 European Union 2021-11-19 Johnson&Johnson 16948949
21884 European Union 2021-11-19 Moderna 61165719
21890 European Union 2021-11-20 Johnson&Johnson 16950779

6348 rows × 4 columns

# Displays a renumbered copy; the old row labels are kept in a new "index"
# column. The result is not assigned, so df itself is unchanged.
df.reset_index()
index Location Date Vaccine TotalVaccinations
0 67 Austria 2021-04-30 Pfizer/BioNTech 2273457
1 71 Austria 2021-05-07 Pfizer/BioNTech 2604643
2 75 Austria 2021-05-14 Pfizer/BioNTech 2904840
3 79 Austria 2021-05-21 Pfizer/BioNTech 3283752
4 83 Austria 2021-05-28 Pfizer/BioNTech 3620298
... ... ... ... ... ...
7621 21886 European Union 2021-11-19 Pfizer/BioNTech 438577477
7622 21890 European Union 2021-11-20 Johnson&Johnson 16950779
7623 21891 European Union 2021-11-20 Moderna 61206560
7624 21892 European Union 2021-11-20 Oxford/AstraZeneca 67212673
7625 21893 European Union 2021-11-20 Pfizer/BioNTech 438725101

7626 rows × 5 columns

# set_index replaces the integer row labels with Location; reset_index then
# brings Location back as a column with a fresh 0..n-1 index.
q=df.set_index("Location")
q.reset_index()
Location Date Vaccine TotalVaccinations
0 Austria 2021-04-30 Pfizer/BioNTech 2273457
1 Austria 2021-05-07 Pfizer/BioNTech 2604643
2 Austria 2021-05-14 Pfizer/BioNTech 2904840
3 Austria 2021-05-21 Pfizer/BioNTech 3283752
4 Austria 2021-05-28 Pfizer/BioNTech 3620298
... ... ... ... ...
7621 European Union 2021-11-19 Pfizer/BioNTech 438577477
7622 European Union 2021-11-20 Johnson&Johnson 16950779
7623 European Union 2021-11-20 Moderna 61206560
7624 European Union 2021-11-20 Oxford/AstraZeneca 67212673
7625 European Union 2021-11-20 Pfizer/BioNTech 438725101

7626 rows × 4 columns

unique()

# Distinct values of the Location column, returned as a NumPy array.
df.Location.unique()
array(['Austria', 'Belgium', 'Chile', 'Croatia', 'Czechia', 'Denmark',
       'Ecuador', 'Finland', 'France', 'Germany', 'Hong Kong', 'Hungary',
       'Ireland', 'Italy', 'Japan', 'Lithuania', 'Netherlands', 'Norway',
       'Poland', 'Portugal', 'Romania', 'Slovakia', 'South Korea',
       'Spain', 'Sweden', 'Switzerland', 'Ukraine', 'United States',
       'Uruguay', 'European Union'], dtype=object)
# Distinct values of the Vaccine column, returned as a NumPy array.
df.Vaccine.unique()
array(['Pfizer/BioNTech', 'Oxford/AstraZeneca', 'Sinovac', 'Moderna',
       'Johnson&Johnson'], dtype=object)

Definir columnas

# Column subset to display, in the order listed.
cols = ["Location", "Date", "Vaccine"]
df.loc[:, cols]
Location Date Vaccine
67 Austria 2021-04-30 Pfizer/BioNTech
71 Austria 2021-05-07 Pfizer/BioNTech
75 Austria 2021-05-14 Pfizer/BioNTech
79 Austria 2021-05-21 Pfizer/BioNTech
83 Austria 2021-05-28 Pfizer/BioNTech
... ... ... ...
21886 European Union 2021-11-19 Pfizer/BioNTech
21890 European Union 2021-11-20 Johnson&Johnson
21891 European Union 2021-11-20 Moderna
21892 European Union 2021-11-20 Oxford/AstraZeneca
21893 European Union 2021-11-20 Pfizer/BioNTech

7626 rows × 3 columns

# Two-level index: Location first, then Vaccine; Date and TotalVaccinations
# remain as the data columns.
g=df.set_index(['Location','Vaccine'])
g
Date TotalVaccinations
Location Vaccine
Austria Pfizer/BioNTech 2021-04-30 2273457
Pfizer/BioNTech 2021-05-07 2604643
Pfizer/BioNTech 2021-05-14 2904840
Pfizer/BioNTech 2021-05-21 3283752
Pfizer/BioNTech 2021-05-28 3620298
... ... ... ...
European Union Pfizer/BioNTech 2021-11-19 438577477
Johnson&Johnson 2021-11-20 16950779
Moderna 2021-11-20 61206560
Oxford/AstraZeneca 2021-11-20 67212673
Pfizer/BioNTech 2021-11-20 438725101

7626 rows × 2 columns

# Selecting on the first index level returns the Austria rows, now indexed
# by the remaining level (Vaccine).
g.loc["Austria"]
Date TotalVaccinations
Vaccine
Pfizer/BioNTech 2021-04-30 2273457
Pfizer/BioNTech 2021-05-07 2604643
Pfizer/BioNTech 2021-05-14 2904840
Pfizer/BioNTech 2021-05-21 3283752
Pfizer/BioNTech 2021-05-28 3620298
Pfizer/BioNTech 2021-06-04 4047114
Pfizer/BioNTech 2021-06-11 4487274
Pfizer/BioNTech 2021-06-18 4944088
Pfizer/BioNTech 2021-06-25 5391151
Pfizer/BioNTech 2021-07-02 5784489
Pfizer/BioNTech 2021-07-09 6116071
Pfizer/BioNTech 2021-07-16 6392275
Pfizer/BioNTech 2021-07-23 6644089
Pfizer/BioNTech 2021-07-30 6856749
Pfizer/BioNTech 2021-08-06 7035563
Pfizer/BioNTech 2021-08-13 7161389
Pfizer/BioNTech 2021-08-20 7255573
Pfizer/BioNTech 2021-08-27 7314987
Pfizer/BioNTech 2021-09-03 7367498
Pfizer/BioNTech 2021-09-10 7423706
Pfizer/BioNTech 2021-09-17 7495432
Pfizer/BioNTech 2021-09-24 7570056
Pfizer/BioNTech 2021-10-01 7652494
Pfizer/BioNTech 2021-10-08 7765350
Pfizer/BioNTech 2021-10-15 7873668
Pfizer/BioNTech 2021-10-22 8000912
Pfizer/BioNTech 2021-10-29 8117682
Pfizer/BioNTech 2021-11-05 8336536
Pfizer/BioNTech 2021-11-12 8758674
Pfizer/BioNTech 2021-11-19 9284152

Operación groupby

# groupby on its own only returns a DataFrameGroupBy object; an aggregation
# such as count() has to be applied to get a table.
df.groupby(["Location"])
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fdb029cb790>
# Number of non-null entries in each column, per Location.
df.groupby("Location").count()
Date Vaccine TotalVaccinations
Location
Austria 30 30 30
Belgium 51 51 51
Chile 476 476 476
Croatia 18 18 18
Czechia 212 212 212
Denmark 27 27 27
Ecuador 287 287 287
European Union 1019 1019 1019
Finland 26 26 26
France 720 720 720
Germany 867 867 867
Hong Kong 260 260 260
Hungary 31 31 31
Ireland 25 25 25
Italy 711 711 711
Japan 135 135 135
Lithuania 3 3 3
Netherlands 57 57 57
Norway 25 25 25
Poland 98 98 98
Portugal 45 45 45
Romania 240 240 240
Slovakia 23 23 23
South Korea 472 472 472
Spain 100 100 100
Sweden 31 31 31
Switzerland 358 358 358
Ukraine 264 264 264
United States 841 841 841
Uruguay 174 174 174
# Number of non-null entries in each column, per Vaccine.
df.groupby("Vaccine").count()
Location Date TotalVaccinations
Vaccine
Johnson&Johnson 552 552 552
Moderna 1499 1499 1499
Oxford/AstraZeneca 1383 1383 1383
Pfizer/BioNTech 3410 3410 3410
Sinovac 782 782 782
# Rows for Austria only, renumbered from 0 (the old labels go to an "index" column).
part_df = df.loc[df["Location"] == "Austria"].reset_index()
# Boolean Series marking the rows where Location is missing.
mask = df["Location"].isnull()
# Display a copy with missing values replaced by 0; df itself is unchanged.
df.fillna(value=0)
Location Date Vaccine TotalVaccinations
67 Austria 2021-04-30 Pfizer/BioNTech 2273457
71 Austria 2021-05-07 Pfizer/BioNTech 2604643
75 Austria 2021-05-14 Pfizer/BioNTech 2904840
79 Austria 2021-05-21 Pfizer/BioNTech 3283752
83 Austria 2021-05-28 Pfizer/BioNTech 3620298
... ... ... ... ...
21886 European Union 2021-11-19 Pfizer/BioNTech 438577477
21890 European Union 2021-11-20 Johnson&Johnson 16950779
21891 European Union 2021-11-20 Moderna 61206560
21892 European Union 2021-11-20 Oxford/AstraZeneca 67212673
21893 European Union 2021-11-20 Pfizer/BioNTech 438725101

7626 rows × 4 columns

Series de tiempo https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv

Algunos repositorios y páginas de interés

  1. https://www.nature.com/sdata/policies/repositories

  2. https://paperswithcode.com/

  3. https://towardsdatascience.com/31-datasets-for-your-next-data-science-project-6ef9a6f8cac6

  4. https://www.data.gov/

  5. https://archive.ics.uci.edu/ml/index.php

  6. https://data.world/datasets/geodata

  7. https://matmatch.com/advanced-search?categories=ceramic

  8. https://github.com/sedaoturak/data-resources-for-materials-science

  9. https://guides.library.cmu.edu/machine-learning/datasets