diff --git a/DesafioDIO_ETL.ipynb b/DesafioDIO_ETL.ipynb new file mode 100644 index 0000000..2d10978 --- /dev/null +++ b/DesafioDIO_ETL.ipynb @@ -0,0 +1,1255 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JqxdSj5j0yuY", + "outputId": "e84b321c-9593-4962-f1c8-4d3baba0ea9b" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Writing banco.csv\n" + ] + } + ], + "source": [ + "%%writefile banco.csv\n", + "age,job,marital,education,default,balance,housing,loan\n", + "34,unemployed,single,primary,no,1587,no,yes\n", + "34,services,married,secondary,no,3789,yes,yes\n", + "31,management,single,tertiary,no,1850,yes,no\n", + "32,management,married,tertiary,no,1476,yes,no\n", + "39,blue-collar,married,secondary,no,0,yes,no\n", + "35,management,single,tertiary,no,747,no,no\n", + "36,self-employed,married,tertiary,no,307,yes,no\n", + "38,technician,single,secondary,no,147,no,no\n", + "42,entrepreneur,married,tertiary,no,221,yes,no\n", + "45,services,married,primary,no,-88,,yes" + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd" + ], + "metadata": { + "id": "14rznqrx1FcU" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv('banco.csv', na_values='na')" + ], + "metadata": { + "id": "8FXN7Il81MPL" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.head(n=10)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 362 + }, + "id": "xyudAlDR1Nva", + "outputId": "dc08e3c3-86f8-4b57-80e7-a6e44bc78265" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": 
"execute_result", + "data": { + "text/plain": [ + " age job marital education default balance housing loan\n", + "0 34 unemployed single primary no 1587 no yes\n", + "1 34 services married secondary no 3789 yes yes\n", + "2 31 management single tertiary no 1850 yes no\n", + "3 32 management married tertiary no 1476 yes no\n", + "4 39 blue-collar married secondary no 0 yes no\n", + "5 35 management single tertiary no 747 no no\n", + "6 36 self-employed married tertiary no 307 yes no\n", + "7 38 technician single secondary no 147 no no\n", + "8 42 entrepreneur married tertiary no 221 yes no\n", + "9 45 services married primary no -88 NaN yes" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agejobmaritaleducationdefaultbalancehousingloan
034unemployedsingleprimaryno1587noyes
134servicesmarriedsecondaryno3789yesyes
231managementsingletertiaryno1850yesno
332managementmarriedtertiaryno1476yesno
439blue-collarmarriedsecondaryno0yesno
535managementsingletertiaryno747nono
636self-employedmarriedtertiaryno307yesno
738techniciansinglesecondaryno147nono
842entrepreneurmarriedtertiaryno221yesno
945servicesmarriedprimaryno-88NaNyes
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.dtypes" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Z2eOai5X1Qv6", + "outputId": "1f5876f9-2b58-4610-e84d-6c342f8c31cf" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "age int64\n", + "job object\n", + "marital object\n", + "education object\n", + "default object\n", + "balance int64\n", + "housing object\n", + "loan object\n", + "dtype: object" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.select_dtypes('object').describe().transpose()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 237 + }, + "id": "FuFA7xQU1R3X", + "outputId": "a8069f88-b544-447c-cf41-d4d8a49bcfea" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " count unique top freq\n", + "job 10 7 management 3\n", + "marital 10 2 married 6\n", + "education 10 3 tertiary 5\n", + "default 10 1 no 10\n", + "housing 9 2 yes 6\n", + "loan 10 2 no 7" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countuniquetopfreq
job107management3
marital102married6
education103tertiary5
default101no10
housing92yes6
loan102no7
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.isna().any()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Wds8rTRQ1UeG", + "outputId": "a38cdefd-fddc-42cd-dffe-b6c1cb552136" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "age False\n", + "job False\n", + "marital False\n", + "education False\n", + "default False\n", + "balance False\n", + "housing True\n", + "loan False\n", + "dtype: bool" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.dropna(inplace=True)" + ], + "metadata": { + "id": "X-UghV8_1WRu" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.head(n=10)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 331 + }, + "id": "p4KdHbku1X6o", + "outputId": "6a846276-7eb1-4286-a148-71032e369618" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " age job marital education default balance housing loan\n", + "0 34 unemployed single primary no 1587 no yes\n", + "1 34 services married secondary no 3789 yes yes\n", + "2 31 management single tertiary no 1850 yes no\n", + "3 32 management married tertiary no 1476 yes no\n", + "4 39 blue-collar married secondary no 0 yes no\n", + "5 35 management single tertiary no 747 no no\n", + "6 36 self-employed married tertiary no 307 yes no\n", + "7 38 technician single secondary no 147 no no\n", + "8 42 entrepreneur married tertiary no 221 yes no" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agejobmaritaleducationdefaultbalancehousingloan
034unemployedsingleprimaryno1587noyes
134servicesmarriedsecondaryno3789yesyes
231managementsingletertiaryno1850yesno
332managementmarriedtertiaryno1476yesno
439blue-collarmarriedsecondaryno0yesno
535managementsingletertiaryno747nono
636self-employedmarriedtertiaryno307yesno
738techniciansinglesecondaryno147nono
842entrepreneurmarriedtertiaryno221yesno
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "idades = []\n", + "with open(file='idades.csv', mode='w', encoding='utf8') as fp:\n", + " linha = 'idade' + '\\n'\n", + " fp.write(linha)\n", + " for idade in idades:\n", + " linha = str(idade) + '\\n'\n", + " fp.write(linha)" + ], + "metadata": { + "id": "_B4FBmxI1mYG" + }, + "execution_count": 13, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index 6dc7e61..3f73402 100644 --- a/README.md +++ b/README.md @@ -1,66 +1,62 @@ # Santander Dev Week 2023 Java API -RESTful API da Santander Dev Week 2023 construída em Java 17 com Spring Boot 3. - -## Principais Tecnologias - - **Java 17**: Utilizaremos a versão LTS mais recente do Java para tirar vantagem das últimas inovações que essa linguagem robusta e amplamente utilizada oferece; - - **Spring Boot 3**: Trabalharemos com a mais nova versão do Spring Boot, que maximiza a produtividade do desenvolvedor por meio de sua poderosa premissa de autoconfiguração; - - **Spring Data JPA**: Exploraremos como essa ferramenta pode simplificar nossa camada de acesso aos dados, facilitando a integração com bancos de dados SQL; - - **OpenAPI (Swagger)**: Vamos criar uma documentação de API eficaz e fácil de entender usando a OpenAPI (Swagger), perfeitamente alinhada com a alta produtividade que o Spring Boot oferece; - - **Railway**: facilita o deploy e monitoramento de nossas soluções na nuvem, além de oferecer diversos bancos de dados como serviço e pipelines de CI/CD. - -## [Link do Figma](https://www.figma.com/file/0ZsjwjsYlYd3timxqMWlbj/SANTANDER---Projeto-Web%2FMobile?type=design&node-id=1421%3A432&mode=design&t=6dPQuerScEQH0zAn-1) - -O Figma foi utilizado para a abstração do domínio desta API, sendo útil na análise e projeto da solução. 
- -## Diagrama de Classes (Domínio da API) - -```mermaid -classDiagram - class User { - -String name - -Account account - -Feature[] features - -Card card - -News[] news - } - - class Account { - -String number - -String agency - -Number balance - -Number limit - } - - class Feature { - -String icon - -String description - } - - class Card { - -String number - -Number limit - } - - class News { - -String icon - -String description - } - - User "1" *-- "1" Account - User "1" *-- "N" Feature - User "1" *-- "1" Card - User "1" *-- "N" News -``` - -## Documentação da API (Swagger) - -### [https://sdw-2023-prd.up.railway.app/swagger-ui.html](https://sdw-2023-prd.up.railway.app/swagger-ui.html) - -Esta API ficará disponível no Railway por um período de tempo limitado, mas este é um código-fonte aberto. Portanto, sintam-se à vontade para cloná-lo, modificá-lo (já que é um bom projeto base para novos projetos) e executar localmente ou onde achar mais interessante! Só não esquece de marcar a gente quando divulgar a sua solução 🥰 - -### IMPORTANTE - -Aos interessados no desenvolvimento da tela inicial do App do Santander (Figma) em Angular, Android, iOS ou Flutter... 
Caso a URL produtiva não esteja mais disponível, deixamos um Backup no GitHub Pages, é só dar um GET lá 😘 -- URL de Produção: https://sdw-2023-prd.up.railway.app/users/1 -- Mock (Backup): https://digitalinnovationone.github.io/santander-dev-week-2023-api/mocks/find_one.json +Desafio: "Explorando IA Generativa em um Pipeline de ETL com Python" parte do Santander Bootcamp 2023 - Ciência de Dados com Python + +## Etapas Realizadas +Arquivo CSV criado usando Google Colab: + +%%writefile banco.csv +age,job,marital,education,default,balance,housing,loan +34,unemployed,single,primary,no,1587,no,yes +34,services,married,secondary,no,3789,yes,yes +31,management,single,tertiary,no,1850,yes,no +32,management,married,tertiary,no,1476,yes,no +39,blue-collar,married,secondary,no,0,yes,no +35,management,single,tertiary,no,747,no,no +36,self-employed,married,tertiary,no,307,yes,no +38,technician,single,secondary,no,147,no,no +42,entrepreneur,married,tertiary,no,221,yes,no +45,services,married,primary,no,-88,,yes + + +#### Extração +Extrair as informações de idades do arquivo anterior: + +import pandas as pd +df = pd.read_csv('banco.csv', na_values='na') +df.head(n=10) + +Analisando as informações + +df.dtypes +df.select_dtypes('object').describe().transpose() + +#### Transform + +Verificar se alguma coluna tem info faltante: +df.isna().any() + +Remover a linha com informação incompleta: +df.dropna(inplace=True) + +Revisar informações após ajuste: +df.head(n=10) + +#### Load + +Extrair as informações de idades do arquivo que foi ajustado e salvar como novo arquivo: + +idades = [] +with open(file='idades.csv', mode='w', encoding='utf8') as fp: + linha = 'idade' + '\n' + fp.write(linha) + for idade in idades: + linha = str(idade) + '\n' + fp.write(linha) + + + + + + +