fix(scan): resolve detection of the first endpoint in the initiate sc…

…an task - Replace the HTTPx first scan with nmap, then launch HTTPx with the discovered port - Create a reusable function to launch nmap on the fly - Add parsing to get ports and services from Nmap output - Add more logs to debug scans while running - Remove the HTTP CRAWL global var; Nmap is now the default way to retrieve the first endpoint (the starting point for all the other tasks) - Adjust the is_alive parameter for tasks that need alive endpoints - Fix S3 scanner source file not found - Add more checks to prevent errors and scan crashes - Refactor Endpoint saving for better logic and fewer errors - Improve URL validation
Security-Tools-Alliance · Nov 22, 2024 · 023e36b · 023e36b
1 parent a3918b7
commit 023e36b
Show file tree

Hide file tree

Showing 6 changed files with 593 additions and 260 deletions.
diff --git a/web/reNgine/celery_custom_task.py b/web/reNgine/celery_custom_task.py
@@ -109,7 +109,10 @@ def __call__(self, *args, **kwargs):
 
 			# Create ScanActivity for this task and send start scan notifs
 			if self.track:
-				logger.warning(f'Task {self.task_name} is RUNNING')
+				if self.domain:
+					logger.warning(f'Task {self.task_name} for {self.subdomain.name if self.subdomain else self.domain.name} is RUNNING')
+				else:
+					logger.warning(f'Task {self.task_name} is RUNNING')
 				self.create_scan_activity()
 
 		if RENGINE_CACHE_ENABLED:
@@ -119,7 +122,10 @@ def __call__(self, *args, **kwargs):
 			if result and result != b'null':
 				self.status = SUCCESS_TASK
 				if RENGINE_RECORD_ENABLED and self.track:
-					logger.warning(f'Task {self.task_name} status is SUCCESS (CACHED)')
+					if self.domain:
+						logger.warning(f'Task {self.task_name} for {self.subdomain.name if self.subdomain else self.domain.name} status is SUCCESS (CACHED)')
+					else:
+						logger.warning(f'Task {self.task_name} status is SUCCESS (CACHED)')
 					self.update_scan_activity()
 				return json.loads(result)
 
@@ -150,7 +156,10 @@ def __call__(self, *args, **kwargs):
 			self.write_results()
 
 			if RENGINE_RECORD_ENABLED and self.track:
-				msg = f'Task {self.task_name} status is {self.status_str}'
+				if self.domain:
+					msg = f'Task {self.task_name} for {self.subdomain.name if self.subdomain else self.domain.name} status is {self.status_str}'
+				else:
+					msg = f'Task {self.task_name} status is {self.status_str}'
 				msg += f' | Error: {self.error}' if self.error else ''
 				logger.warning(msg)
 				self.update_scan_activity()

diff --git a/web/reNgine/common_func.py b/web/reNgine/common_func.py
@@ -506,6 +506,45 @@ def extract_path_from_url(url):
 
 	return reconstructed_url
 
+def is_valid_url(url):
+    """Check if a URL is valid: full URL, host, or host:port format.
+
+    Args:
+        url (str): URL to validate (https://domain.com, domain.com:port,
+            a bare IPv4/IPv6 address, or [ipv6]:port)
+
+    Returns:
+        bool: True if valid URL, False otherwise (including non-string input)
+    """
+    logger.debug(f'Validating URL: {url}')
+    if not isinstance(url, str):
+        return False
+    # validators returns a falsy failure object rather than False, so coerce
+    if url.startswith(('http://', 'https://')):
+        return bool(validators.url(url))
+
+    try:
+        # A bare IPv6 address is full of ':' and must be accepted before the
+        # host:port split below, which would cut it in the wrong place
+        if validators.ipv6(url):
+            return True
+        domain, sep, port = url.rpartition(':')
+        if not sep:
+            domain = url
+        elif not 1 <= int(port) <= 65535:
+            logger.debug(f'Invalid port number: {int(port)}')
+            return False
+        if domain.startswith('[') and domain.endswith(']'):
+            return bool(validators.ipv6(domain[1:-1]))  # [ipv6]:port form
+        if validators.domain(domain) or validators.ipv4(domain) or validators.ipv6(domain):
+            logger.debug(f'Valid domain/IP found: {domain}')
+            return True
+        logger.debug(f'Invalid domain/IP: {domain}')
+        return False
+    except (ValueError, ValidationError) as e:
+        logger.debug(f'Validation error: {str(e)}')
+        return False
+
 #-------#
 # Utils #
 #-------#
@@ -878,7 +917,7 @@ def get_task_cache_key(func_name, *args, **kwargs):
 
 
 def get_output_file_name(scan_history_id, subscan_id, filename):
-	title = f'#{scan_history_id}'
+	title = f'{scan_history_id}'
 	if subscan_id:
 		title += f'-{subscan_id}'
 	title += f'_{filename}'
@@ -925,21 +964,28 @@ def get_nmap_cmd(
 		script=None,
 		script_args=None,
 		max_rate=None,
-		service_detection=True,
 		flags=[]):
-	if not cmd:
-		cmd = 'nmap'
 
+	# Initialize base options
 	options = {
-		"-sV": service_detection,
-		"-p": ports,
+		"--max-rate": max_rate,
+		"-oX": output_file,
 		"--script": script,
 		"--script-args": script_args,
-		"--max-rate": max_rate,
-		"-oX": output_file
 	}
+
+	if not cmd:
+		cmd = 'nmap'
+		# Update options with nmap specific parameters
+		options.update({
+			"-sV": "",
+			"-p": ports,
+		})
+
+	# Build command with options
 	cmd = _build_cmd(cmd, options, flags)
 
+	# Add input source
 	if not input_file:
 		cmd += f" {host}" if host else ""
 	else:
@@ -1352,4 +1398,23 @@ def get_ips_from_cidr_range(target):
         return [str(ip) for ip in ipaddress.IPv4Network(target)]
     except ValueError:
         logger.error(f'{target} is not a valid CIDR range. Skipping.')
-        return []
+        return []
+
+def get_http_crawl_value(engine, config):
+    """Get HTTP crawl value from the subscan config, else from the scan engine.
+
+    Args:
+        engine: EngineType object (its yaml_configuration is read as fallback)
+        config: Configuration dictionary or None
+
+    Returns:
+        The subscan value when set, else the engine value or its default
+    """
+    # subscan engine value
+    enable_http_crawl = config.get(ENABLE_HTTP_CRAWL) if config else None
+    if enable_http_crawl is None:
+        # scan engine value; safe_load gives None for an empty configuration
+        yaml_config = yaml.safe_load(engine.yaml_configuration) or {}
+        enable_http_crawl = yaml_config.get(ENABLE_HTTP_CRAWL, DEFAULT_ENABLE_HTTP_CRAWL)
+    logger.debug(f'Enable HTTP crawl: {enable_http_crawl}')
+    return enable_http_crawl
diff --git a/web/reNgine/settings.py b/web/reNgine/settings.py
@@ -45,7 +45,7 @@
 DOMAIN_NAME = env('DOMAIN_NAME', default='localhost:8000')
 TEMPLATE_DEBUG = env.bool('TEMPLATE_DEBUG', default=False)
 SECRET_FILE = os.path.join(RENGINE_HOME, 'secret')
-DEFAULT_ENABLE_HTTP_CRAWL = env.bool('DEFAULT_ENABLE_HTTP_CRAWL', default=True)
+DEFAULT_ENABLE_HTTP_CRAWL = env.bool('DEFAULT_ENABLE_HTTP_CRAWL', default=False)
 DEFAULT_RATE_LIMIT = env.int('DEFAULT_RATE_LIMIT', default=150) # requests / second
 DEFAULT_HTTP_TIMEOUT = env.int('DEFAULT_HTTP_TIMEOUT', default=5) # seconds
 DEFAULT_RETRIES = env.int('DEFAULT_RETRIES', default=1)