Skip to content

Commit

Permalink
[MRG] Add estela proxies (#231) (#234)
Browse files Browse the repository at this point in the history
* [MRG] Add bmc proxies (#231)
* Add Estela Proxies.

---------

Co-authored-by: mgonnav <[email protected]>

* Update RESERVED_PROXY_NAMES variable name. Add download size settings as variables
* Add PROXY_PROVIDERS_TO_TRACK, MAX_WEB_DOWNLOAD_SIZE_MB, and MAX_CLI_DOWNLOAD_CHUNK_MB to the API docs

---------

Co-authored-by: joaquin garmendia <[email protected]>
  • Loading branch information
mgonnav and joaquingx authored Nov 1, 2023
1 parent 6b3e6d5 commit 4a5ea7e
Show file tree
Hide file tree
Showing 42 changed files with 2,044 additions and 79 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@ __pycache__/
.DS_Store

# Certificates
*.crt
*.crt

bitmaker_billing/
16 changes: 16 additions & 0 deletions docs/estela/installation/helm-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,22 @@ All the queue platform variables should be written as children of the _<QUEUE\_P
{: .note }
> The mailing configuration is used to send email regarding users creation on the estela system.
#### Data Downloads
* _<MAX\_CLI\_DOWNLOAD\_CHUNK\_MB>_ (Required): This is the maximum size of the chunks when downloading data
via the [estela-cli](https://estela-cli.bitmaker.la/). E.g., if this is set to a value of 2 and you download 1GB of data, 500 chunks would be
downloaded.
* _<MAX\_WEB\_DOWNLOAD\_SIZE\_MB>_ (Required): This is the maximum download size via Estela's web interface.
We recommend not setting this value higher than 2GB, and you should update the timeout value for your API
according to the value you set here. E.g., if you use `gunicorn`, you would add the `timeout` flag:
`gunicorn config.wsgi --bind=0.0.0.0:8000 --timeout=600`. We encourage you to use the estela-cli for bigger
downloads.

#### Proxies

* _<PROXY\_PROVIDERS\_TO\_TRACK>_ (Optional): In Estela, you can add custom proxy providers that you
can configure and reuse in your projects, spiders, jobs and cronjobs. In this variable,
set a comma-separated list of the names of the proxy providers you want to track. E.g., `my_custom_proxy,my_other_custom_proxy`.

### estela queueing variables

* _<CONSUMER\_PRODUCTION>_ (Required): Set this value to `"False"` if the database
Expand Down
13 changes: 11 additions & 2 deletions estela-api/api/serializers/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,13 @@ class SpiderJobUpdateSerializer(serializers.ModelSerializer):
SpiderJob.RUNNING_STATUS,
]

job_fields = ["lifespan", "total_response_bytes", "item_count", "request_count"]
job_fields = [
"lifespan",
"total_response_bytes",
"item_count",
"request_count",
"proxy_usage_data",
]

class Meta:
model = SpiderJob
Expand All @@ -155,6 +161,7 @@ class Meta:
"request_count",
"data_status",
"data_expiry_days",
"proxy_usage_data",
)

def update(self, instance, validated_data):
Expand Down Expand Up @@ -188,7 +195,9 @@ def update(self, instance, validated_data):
instance.status = status

for field in self.job_fields:
if not getattr(instance, field):
if not getattr(instance, field) or getattr(
instance, field
) != validated_data.get(field):
new_value = validated_data.get(field, getattr(instance, field))
setattr(instance, field, new_value)

Expand Down
23 changes: 23 additions & 0 deletions estela-api/api/serializers/proxyprovider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from rest_framework import serializers
from core.models import ProxyProvider
from api.serializers.job_specific import SpiderJobEnvVarSerializer


class ProxyProviderSerializer(serializers.ModelSerializer):
    """Serialize a ``ProxyProvider`` through its ``name``, ``description`` and ``proxyid`` fields."""

    class Meta:
        model = ProxyProvider
        # Only these three model fields are listed for serialization.
        fields = ("name", "description", "proxyid")


class ProxyProviderUpdateSerializer(serializers.Serializer):
    """Request payload naming the project or spider a proxy provider is applied to.

    ``level`` is restricted to the two values the consuming view handles, so
    any other value is rejected during validation instead of failing later
    inside the view.
    """

    level = serializers.ChoiceField(
        choices=["project", "spider"],
        help_text="Level where the proxy is applied: 'project' or 'spider'",
    )
    project_or_spider_id = serializers.CharField(
        max_length=100,
        help_text="Id of the project or spider where the update will be performed",
    )


class ProxyProviderResponseSerializer(serializers.Serializer):
    """Response payload returned after applying a proxy provider.

    ``success`` flags the outcome and ``env_vars`` optionally carries the
    serialized environment variables of the updated project or spider.
    """

    success = serializers.BooleanField()
    env_vars = SpiderJobEnvVarSerializer(
        many=True,
        required=False,
        help_text="Env vars for the instance (project or spider)",
    )
5 changes: 5 additions & 0 deletions estela-api/api/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
job_data as job_data_views,
stats as stats_views,
notification as notification_views,
proxyprovider as proxyprovider_views,
)

router = routers.DefaultRouter(trailing_slash=False)
Expand Down Expand Up @@ -58,6 +59,10 @@
viewset=stats_views.SpidersJobsStatsViewSet,
basename="spider-stats",
)
router.register(
prefix=r"proxy_provider",
viewset=proxyprovider_views.ProxyProviderViewSet,
)
router.register(prefix=r"auth", viewset=auth_views.AuthAPIViewSet, basename="auth")
router.register(
prefix=r"auth/profile", viewset=auth_views.UserProfileViewSet, basename="profile"
Expand Down
47 changes: 42 additions & 5 deletions estela-api/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from api import errors
from api.exceptions import DataBaseError
from config.job_manager import spiderdata_db_client
from core.models import SpiderJobEnvVar
from core.models import SpiderJobEnvVar, ProxyProvider


def update_env_vars(instance, env_vars, level="project"):
def update_env_vars(instance, env_vars, level="project", delete=True):
env_vars_instance = instance.env_vars.all()
for env_var in env_vars:
if env_vars_instance.filter(**env_var).exists():
Expand All @@ -29,9 +29,10 @@ def update_env_vars(instance, env_vars, level="project"):
elif level == "spider":
SpiderJobEnvVar.objects.create(spider=instance, **env_var)

for env_var in env_vars_instance:
if env_var.name not in [value["name"] for value in env_vars]:
env_var.delete()
if delete:
for env_var in env_vars_instance:
if env_var.name not in [value["name"] for value in env_vars]:
env_var.delete()


def update_stats_from_redis(job, save_to_database=False):
Expand Down Expand Up @@ -72,3 +73,39 @@ def delete_stats_from_redis(job):
redis_conn.delete(f"scrapy_stats_{job.key}")
except:
pass


def get_proxy_provider_envs(proxy_id):
    """Build the env var dicts describing the proxy provider with pk ``proxy_id``.

    Each of the provider's ``username``, ``password``, ``host``, ``port`` and
    ``name`` attributes becomes an ``ESTELA_PROXY_<SUFFIX>`` entry; every
    entry except the name one is flagged as masked. A final unmasked
    ``ESTELA_PROXIES_ENABLED`` entry with value ``"True"`` is appended.

    Returns:
        A list of ``{"name": ..., "value": ..., "masked": ...}`` dicts.
    """
    provider = ProxyProvider.objects.get(pk=proxy_id)

    # Provider attribute -> suffix of the resulting ESTELA_PROXY_<SUFFIX> name.
    suffix_by_attr = {
        "username": "USER",
        "password": "PASS",
        "host": "URL",
        "port": "PORT",
        "name": "NAME",
    }

    # Iterate the instance attributes (not the mapping above) so the entries
    # keep the order in which the attributes appear on the instance.
    env_vars = [
        {
            "name": f"ESTELA_PROXY_{suffix_by_attr[attr]}",
            "value": attr_value,
            "masked": attr != "name",
        }
        for attr, attr_value in vars(provider).items()
        if attr in suffix_by_attr
    ]
    env_vars.append(
        {"name": "ESTELA_PROXIES_ENABLED", "value": "True", "masked": False}
    )
    return env_vars
23 changes: 21 additions & 2 deletions estela-api/api/views/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
SpiderJobSerializer,
SpiderJobUpdateSerializer,
)
from api.utils import update_stats_from_redis
from api.utils import update_stats_from_redis, get_proxy_provider_envs
from config.job_manager import job_manager
from core.models import DataStatus, Project, Spider, SpiderJob
from core.models import DataStatus, Project, Spider, SpiderJob, ProxyProvider


class SpiderJobViewSet(
Expand Down Expand Up @@ -120,6 +120,25 @@ def create(self, request, *args, **kwargs):
job_env_vars = {
env_var.name: env_var.value for env_var in job.env_vars.all()
}

proxy_provider_names = [
(proxy.name, proxy.proxyid) for proxy in ProxyProvider.objects.all()
]
proxy_name = job_env_vars.get("ESTELA_PROXY_NAME")

if proxy_name:
proxy_id = next(
(tup[1] for tup in proxy_provider_names if proxy_name in tup), None
)
if proxy_id:
proxy_env_vars = get_proxy_provider_envs(proxy_id)
job_env_vars.update(
{
env_var["name"]: env_var["value"]
for env_var in proxy_env_vars
}
)

token = request.auth.key if request.auth else None
job_manager.create_job(
job.name,
Expand Down
4 changes: 2 additions & 2 deletions estela-api/api/views/job_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def list(self, request, *args, **kwargs):
elif request.META["HTTP_USER_AGENT"].startswith("estela-cli/"):
chunk_size = max(
1,
settings.MAX_CHUNK_SIZE
settings.MAX_CLI_DOWNLOAD_CHUNK_SIZE
// spiderdata_db_client.get_estimated_document_size(
kwargs["pid"], job_collection_name
),
Expand Down Expand Up @@ -207,7 +207,7 @@ def download(self, request, *args, **kwargs):
else:
docs_limit = max(
1,
(settings.MAX_WEB_DOWNLOAD_SIZE)
settings.MAX_WEB_DOWNLOAD_SIZE
// spiderdata_db_client.get_estimated_document_size(
kwargs["pid"], job_collection_name
),
Expand Down
85 changes: 85 additions & 0 deletions estela-api/api/views/proxyprovider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from rest_framework import viewsets, status
from rest_framework.response import Response
from rest_framework import serializers
from core.models import ProxyProvider, Project, Spider, SpiderJobEnvVar, SpiderJob
from api.serializers.proxyprovider import (
ProxyProviderUpdateSerializer,
ProxyProviderSerializer,
ProxyProviderResponseSerializer,
)
from api.serializers.job_specific import SpiderJobEnvVarSerializer
from api.mixins import BaseViewSet, ActionHandlerMixin
from drf_yasg.utils import swagger_auto_schema
from api.utils import update_env_vars

# from utils import update_env_vars


class ProxyProviderViewSet(BaseViewSet, viewsets.ModelViewSet, ActionHandlerMixin):
    """Model viewset for ``ProxyProvider``.

    ``update`` is overridden: instead of modifying the provider itself, it
    copies the provider's connection settings into the environment variables
    of a project or a spider.
    """

    queryset = ProxyProvider.objects.all()
    serializer_class = ProxyProviderSerializer

    # Request ``level`` -> model whose env vars receive the proxy settings.
    _TARGET_MODELS = {"project": Project, "spider": Spider}

    # Provider attribute -> suffix of the resulting ESTELA_PROXY_<SUFFIX> name.
    _ENV_SUFFIXES = {
        "username": "USER",
        "password": "PASS",
        "host": "URL",
        "port": "PORT",
        "name": "NAME",
    }

    @swagger_auto_schema(
        request_body=ProxyProviderUpdateSerializer,  # Serializer describing the request
        responses={
            status.HTTP_200_OK: ProxyProviderResponseSerializer()
        },  # Documented responses
    )
    def update(self, request, *args, **kwargs):
        """Apply this proxy provider to the project or spider named in the request.

        The request body carries ``level`` ("project" or "spider") and
        ``project_or_spider_id``. The provider's settings are written as
        ``ESTELA_PROXY_*`` env vars on that target without deleting the
        target's other env vars.

        Returns:
            200 with ``{"success": True, "env_vars": [...]}`` listing the
            target's env vars after the update; 404 if the target does not
            exist.

        Raises:
            serializers.ValidationError: if the payload is invalid or
                ``level`` is neither "project" nor "spider".
        """
        # NOTE(review): nothing in this method checks that the requester may
        # modify the target project/spider - confirm the base classes'
        # permissions cover this.
        serializer = ProxyProviderUpdateSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        level = serializer.validated_data["level"]
        target_id = serializer.validated_data["project_or_spider_id"]

        target_model = self._TARGET_MODELS.get(level)
        if target_model is None:
            # Previously an unknown level left the target unbound and crashed.
            raise serializers.ValidationError(
                {"level": "Must be either 'project' or 'spider'."}
            )
        try:
            # Spider-level env vars are attached to a Spider (not a SpiderJob):
            # update_env_vars creates them with ``spider=instance``.
            instance = target_model.objects.get(pk=target_id)
        except target_model.DoesNotExist:
            return Response(
                {"detail": f"No {level} found with id '{target_id}'."},
                status=status.HTTP_404_NOT_FOUND,
            )

        proxy_provider = self.get_object()
        # Iterate the instance attributes so the env vars keep the order in
        # which the attributes appear on the provider; only the provider name
        # is left unmasked.
        env_vars = [
            {
                "name": f"ESTELA_PROXY_{self._ENV_SUFFIXES[attr]}",
                "value": attr_value,
                "masked": attr != "name",
            }
            for attr, attr_value in vars(proxy_provider).items()
            if attr in self._ENV_SUFFIXES
        ]
        # delete=False: keep the target's env vars that are not proxy related.
        update_env_vars(instance, env_vars, level=level, delete=False)

        # ``level`` is "project" or "spider", matching the ``project_id`` and
        # ``spider_id`` lookups on SpiderJobEnvVar.
        env_vars_instance = SpiderJobEnvVar.objects.filter(
            **{f"{level}_id": target_id}
        )
        env_vars_serialized = SpiderJobEnvVarSerializer(
            env_vars_instance, required=False, many=True
        )
        resp_serializer = ProxyProviderResponseSerializer(
            data={"success": True, "env_vars": env_vars_serialized.data}
        )
        # A serializer built with ``data=`` must be validated before ``.data``
        # can be read.
        resp_serializer.is_valid()
        return Response(resp_serializer.data, status=status.HTTP_200_OK)
23 changes: 18 additions & 5 deletions estela-api/config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
AWS_STORAGE_BUCKET_NAME=(str, "estela-django-api"),
GOOGLE_APPLICATION_CREDENTIALS=(str, "dummy"),
GOOGLE_APPLICATION_LOCATION=(str, "dummy"),
MAX_CLI_DOWNLOAD_CHUNK_MB=(int, 2),
MAX_WEB_DOWNLOAD_SIZE_MB=(int, 1024),
MULTI_NODE_MODE=(str, "False"),
BUCKET_NAME_PROJECTS=(str, "dummy"),
SECRET_KEY=(str, "dummy"),
Expand All @@ -63,6 +65,7 @@
EMAIL_HOST=(str, "dummy"),
EMAIL_PORT=(int, "dummy"),
VERIFICATION_EMAIL=(str, "dummy"),
PROXY_PROVIDERS_TO_TRACK=(str, ""),
)
environ.Env.read_env(env_file=".env")

Expand Down Expand Up @@ -210,8 +213,8 @@


# API limit data download settings (bytes)
MAX_CHUNK_SIZE = 2 * 1024 * 1024
MAX_WEB_DOWNLOAD_SIZE = 100 * 1024 * 1024
MAX_CLI_DOWNLOAD_CHUNK_SIZE = env("MAX_CLI_DOWNLOAD_CHUNK_MB") * 1024 * 1024
MAX_WEB_DOWNLOAD_SIZE = env("MAX_WEB_DOWNLOAD_SIZE_MB") * 1024 * 1024

# Pagination settings used in api_app
API_PAGE_SIZE = 100 # Paginator page size
Expand Down Expand Up @@ -280,7 +283,7 @@
CREDENTIALS = env("CREDENTIALS")
SPIDERDATA_DB_ENGINE = env("SPIDERDATA_DB_ENGINE")

# Spiderdata Database settings
# Spiderdata database settings
SPIDERDATA_DB_CONNECTION = env("SPIDERDATA_DB_CONNECTION")
SPIDERDATA_DB_PRODUCTION = True
SPIDERDATA_DB_CERTIFICATE_PATH = env("SPIDERDATA_DB_CERTIFICATE_PATH")
Expand All @@ -293,8 +296,18 @@
EMAIL_PORT = env("EMAIL_PORT")
EMAILS_TO_ALERT = env("EMAILS_TO_ALERT")

# Accept new Users
# Enable/disable the user register endpoint
REGISTER = env("REGISTER")

# Verification Email
# Verification email
VERIFICATION_EMAIL = env("VERIFICATION_EMAIL")

# Proxy Settings
# Each tracked provider becomes a (display name, usage key) pair, e.g.
# "my_custom_proxy" -> ("My Custom Proxy", "my_custom_proxy_usage").
# Names are stripped and blank entries skipped so values such as "a, b", a
# trailing comma, or the empty default do not produce malformed entries.
PROXY_PROVIDERS_TO_TRACK = [
    (name.replace("_", " ").title(), f"{name}_usage")
    for name in (
        raw_name.strip() for raw_name in env("PROXY_PROVIDERS_TO_TRACK").split(",")
    )
    if name
]
Loading

0 comments on commit 4a5ea7e

Please sign in to comment.