diff --git a/src/collector/UtilityOps.py b/src/collector/UtilityOps.py
deleted file mode 100644
index c211e6f..0000000
--- a/src/collector/UtilityOps.py
+++ /dev/null
@@ -1,22 +0,0 @@
-
-class UtilityOps:
- @staticmethod
- def GetClientIP(request):
- try:
- x_forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR')
- if x_forwarded_for:
- ip = x_forwarded_for.split(',')[0]
- else:
- ip = request.META.get('REMOTE_ADDR')
- return ip
- except Exception:
- return None
-
-
- @staticmethod
- def GetDictValues(dict, key, default):
- if dict and key in dict:
- return dict[key]
- return default
-
-
diff --git a/src/collector/models.py b/src/collector/models.py
index 977346f..a59c2f3 100644
--- a/src/collector/models.py
+++ b/src/collector/models.py
@@ -1,127 +1,71 @@
from django.db import models
-
+from hashlib import sha256
class PuzzlePiece(models.Model):
- url = models.URLField(verbose_name="image url")
- hash = models.CharField(max_length=64, unique=True, default="empty", verbose_name="sha256 hash of the url")
- ip_address = models.CharField(max_length=64, default="?.?.?.?", verbose_name="hash of submitter ip address")
- submitted_date = models.DateTimeField(verbose_name="submitted date", auto_now_add=True)
- last_modified = models.DateTimeField(verbose_name="last modified date", auto_now=True)
- approved = models.NullBooleanField(verbose_name="is image approved for verification")
- priority = models.PositiveIntegerField(default=0,verbose_name="Priority value in transcription queue")
- transCount = models.PositiveIntegerField(default=0,verbose_name="Number of transcriptions received for this image")
-
- def __str__(self):
- data = []
- data.append("URL: {}".format(self.url))
- data.append("ip_address: {}".format(self.ip_address))
- data.append("submitted_date: {}".format(self.submitted_date))
- data.append("last_modified: {}".format(self.last_modified))
- data.append("hash: {}".format(self.hash))
- data.append("transCount: {}".format(self.transCount))
+ url = models.URLField(verbose_name="Image url")
+ hash = models.CharField(max_length=64, unique=True, verbose_name="SHA256 hash of the url", default=None, null=True, blank=True)
+ submitter = models.CharField(max_length=64, verbose_name="Hash of submitter ip address", default="", blank=True)
+ submission_date = models.DateTimeField(verbose_name="Submission date", auto_now_add=True)
+ priority = models.PositiveIntegerField(verbose_name="Priority value in transcription queue", default=0)
+ confidence = models.PositiveIntegerField(verbose_name="Confidence score", default=0, blank=True)
- result = ""
- for d in data:
- result += "
{}".format(d)
- result = ""
- return result
+ def save(self, *args, **kwargs):
+ self.hash = self.calculate_hash()
+ super().save(*args, **kwargs)
+ def calculate_hash(self):
+ return sha256(str(self.url).encode('utf-8')).hexdigest()
-class TranscriptionData(models.Model):
+class Transcription(models.Model):
class Meta:
indexes = [
-            models.Index(fields=['ip_address'], name='ip_address_idx')
+            models.Index(fields=['submitter'], name='submitter_idx')
]
-
- puzzlePiece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="transcriptions")
- ip_address = models.CharField(max_length=64, default="?.?.?.?", verbose_name="hash of submitter ip address")
- submitted_date = models.DateTimeField(verbose_name="submitted date", auto_now=True)
-
- bad_image = models.BooleanField(verbose_name="image is bad or hard to read")
- orientation = models.CharField(max_length=10, default="", verbose_name="orientation direction from image")
- rawdata = models.TextField(default="", verbose_name="Raw JSON taken in for later debugging.")
- datahash = models.CharField(max_length=64, default="", verbose_name="sha256 hash for easier comparisons")
-
- center = models.CharField(max_length=20, verbose_name="center")
-
- wall1 = models.BooleanField(verbose_name="wall 1 (top)")
- wall2 = models.BooleanField(verbose_name="wall 2 (top-right)")
- wall3 = models.BooleanField(verbose_name="wall 3 (bottom-right)")
- wall4 = models.BooleanField(verbose_name="wall 4 (bottom)")
- wall5 = models.BooleanField(verbose_name="wall 5 (bottom-left)")
- wall6 = models.BooleanField(verbose_name="wall 6 (top-left)")
-
- link1 = models.CharField(max_length=7, verbose_name="link 1 (top)")
- link2 = models.CharField(max_length=7, verbose_name="link 2 (top-right)")
- link3 = models.CharField(max_length=7, verbose_name="link 3 (bottom-right)")
- link4 = models.CharField(max_length=7, verbose_name="link 4 (bottom)")
- link5 = models.CharField(max_length=7, verbose_name="link 5 (bottom-left)")
- link6 = models.CharField(max_length=7, verbose_name="link 6 (top-left)")
-
- def __str__(self):
- return "{} {} {}".format(self.center, self.wall1, self.link1)
-
-
-class BadImage(models.Model):
- puzzlePiece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="badimages")
- last_modified = models.DateTimeField(verbose_name="last modified date", auto_now=True)
- badCount = models.PositiveIntegerField(default=0,verbose_name="how often this image was reported as bad")
-
- class Meta:
- unique_together = ('id', 'puzzlePiece',)
-
-class RotatedImage(models.Model):
- puzzlePiece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="rotatedimages")
- last_modified = models.DateTimeField(verbose_name="last modified date", auto_now=True)
- rotatedCount = models.PositiveIntegerField(default=0,verbose_name="how often this image was reported as incorrectly rotated")
-
- class Meta:
- unique_together = ('id', 'puzzlePiece',)
-
-class ConfidenceTracking(models.Model):
- puzzlePiece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="confidences")
- last_modified = models.DateTimeField(verbose_name="last modified date", auto_now=True)
- confidence = models.PositiveIntegerField(default=0,verbose_name="how confident are we in this image, 0 to 100")
-
- class Meta:
- unique_together = ('id', 'puzzlePiece',)
-
-
-class ConfidentSolution(models.Model):
- puzzlePiece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="confidentsolutions")
- last_modified = models.DateTimeField(verbose_name="last modified date", auto_now=True)
- confidence = models.PositiveIntegerField(default=0,verbose_name="how confident are we in this image, 0 to 100")
- datahash = models.CharField(max_length=64, default="", verbose_name="sha256 hash for easier comparisons")
-
- center = models.CharField(max_length=20, verbose_name="center")
-
- wall1 = models.BooleanField(verbose_name="wall 1 (top)")
- wall2 = models.BooleanField(verbose_name="wall 2 (top-right)")
- wall3 = models.BooleanField(verbose_name="wall 3 (bottom-right)")
- wall4 = models.BooleanField(verbose_name="wall 4 (bottom)")
- wall5 = models.BooleanField(verbose_name="wall 5 (bottom-left)")
- wall6 = models.BooleanField(verbose_name="wall 6 (top-left)")
-
- link1 = models.CharField(max_length=7, verbose_name="link 1 (top)")
- link2 = models.CharField(max_length=7, verbose_name="link 2 (top-right)")
- link3 = models.CharField(max_length=7, verbose_name="link 3 (bottom-right)")
- link4 = models.CharField(max_length=7, verbose_name="link 4 (bottom)")
- link5 = models.CharField(max_length=7, verbose_name="link 5 (bottom-left)")
- link6 = models.CharField(max_length=7, verbose_name="link 6 (top-left)")
-
- def copyFromTranscription(self, transcription):
- self.puzzlePiece = transcription.puzzlePiece
- self.center = transcription.center
- self.wall1 = transcription.wall1
- self.wall2 = transcription.wall2
- self.wall3 = transcription.wall3
- self.wall4 = transcription.wall4
- self.wall5 = transcription.wall5
- self.wall6 = transcription.wall6
- self.link1 = transcription.link1
- self.link2 = transcription.link2
- self.link3 = transcription.link3
- self.link4 = transcription.link4
- self.link5 = transcription.link5
- self.link6 = transcription.link6
+ # Metadata
+ puzzle_piece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="transcriptions", null=True, default=None)
+ submitter = models.CharField(max_length=64, verbose_name="Hash of submitter ip address", default="", blank=True)
+ submission_date = models.DateTimeField(verbose_name="Submission date", auto_now_add=True)
+ # Flags
+ bad_flag = models.BooleanField(verbose_name="Image is bad or hard to read", default=False)
+    rotation_flag = models.BooleanField(verbose_name="Image is rotated", default=False)
+ # Center in short notation
+ center = models.CharField(max_length=1, verbose_name="center", default="")
+ # Walls
+ wall1 = models.BooleanField(verbose_name="Wall 1 (top)", default=False)
+ wall2 = models.BooleanField(verbose_name="Wall 2 (top-right)", default=False)
+ wall3 = models.BooleanField(verbose_name="Wall 3 (bottom-right)", default=False)
+ wall4 = models.BooleanField(verbose_name="Wall 4 (bottom)", default=False)
+ wall5 = models.BooleanField(verbose_name="Wall 5 (bottom-left)", default=False)
+ wall6 = models.BooleanField(verbose_name="Wall 6 (top-left)", default=False)
+ # Links
+ link1 = models.CharField(max_length=7, verbose_name="Link 1 (top)", default="")
+ link2 = models.CharField(max_length=7, verbose_name="Link 2 (top-right)", default="")
+ link3 = models.CharField(max_length=7, verbose_name="Link 3 (bottom-right)", default="")
+ link4 = models.CharField(max_length=7, verbose_name="Link 4 (bottom)", default="")
+ link5 = models.CharField(max_length=7, verbose_name="Link 5 (bottom-left)", default="")
+ link6 = models.CharField(max_length=7, verbose_name="Link 6 (top-left)", default="")
+ # SHA256 hash of 'center wallsAsBitstring link1 link2 link3 link4 link5 link6' for comparison
+ hash = models.CharField(max_length=64, verbose_name="SHA256 hash for comparison", default="", null=True, blank=True)
+
+ def save(self, *args, **kwargs):
+ self.sanitize_fields()
+ if self.bad_flag:
+ self.hash = None
+ else:
+ self.hash = self.calculate_hash()
+ super().save(*args, **kwargs)
+
+ def calculate_hash(self):
+ hashInput = (
+ f"{self.center} "
+ f"{str(int(self.wall1))}{str(int(self.wall2))}{str(int(self.wall3))}"
+ f"{str(int(self.wall4))}{str(int(self.wall5))}{str(int(self.wall6))} "
+ f"{self.link1} {self.link2} {self.link3} {self.link4} {self.link5} {self.link6}"
+ ).encode("utf-8")
+ return sha256(hashInput).hexdigest()
+
+ def sanitize_fields(self):
+ fields = [ "center", "link1", "link2", "link3", "link4", "link5", "link6" ]
+ for field in fields:
+ self.__setattr__(field, self.__getattribute__(field).upper())
diff --git a/src/collector/urls.py b/src/collector/urls.py
index 089be9d..58f9d3e 100644
--- a/src/collector/urls.py
+++ b/src/collector/urls.py
@@ -5,23 +5,20 @@
api_router = routers.DefaultRouter()
api_router.register(r'pieces', views.PuzzlePieceViewSet)
-api_router.register(r'transcriptions', views.TranscriptionDataViewSet)
+api_router.register(r'transcriptions', views.TranscriptionViewSet)
urlpatterns = [
path("", views.index, name="index"),
path('api/', include(api_router.urls)),
- path("puzzlepieces/submit", views.puzzlepieceSubmit, name="puzzlepieceSubmit"),
- path("puzzlepieces/", views.PuzzlepieceIndex.as_view(), name="puzzlepieceIndex"),
- path("puzzlepieces//", views.puzzlepieceView, name="puzzlepieceView"),
+ path("puzzlepieces/submit", views.submit_puzzle_piece, name="puzzlepieceSubmit"),
+ path("puzzlepieces/", views.PuzzlePieceIndex.as_view(), name="puzzlepieceIndex"),
+ path("puzzlepieces//", views.puzzle_piece_view, name="puzzlepieceView"),
path("transcriptions", views.TranscriptionsIndex.as_view(), name="transcriptions"),
- path("transcriptions/", views.transcriptionsDetail, name="transcriptionsDetail"),
+ path("transcriptions/", views.transcriptions_detail, name="transcriptionsDetail"),
path("transcriptions/guide", views.transcriptionGuide, name="transcriptionGuide"),
path("transcribe", views.TranscribeIndex.as_view(), name="transcribe"),
- path("transcribe/", views.processTranscription, name="transcribeResults"),
- path("confidence", views.ConfidenceIndex.as_view(), name="confidenceIndex"),
- path("confidence/", views.confidenceDetail, name="confidenceDetail"),
+ path("transcribe/", views.process_transcription, name="transcribeResults"),
path("solutions", views.ConfidenceSolutionIndex.as_view(), name="confidenceSolutionIndex"),
- path("solutions/", views.confidenceSolutionDetail, name="confidenceSolutionDetail"),
path("export/verified/csv", views.exportVerifiedCSV, name="exportVerifiedCSV"),
path("export/pieces/csv", views.exportPiecesCSV, name="exportPiecesCSV"),
path("export/transcriptions/csv", views.exportTranscriptionsCSV, name="exportTranscriptionsCSV"),
diff --git a/src/collector/utility.py b/src/collector/utility.py
new file mode 100644
index 0000000..cb7ca5c
--- /dev/null
+++ b/src/collector/utility.py
@@ -0,0 +1,33 @@
+import hmac
+from urllib.parse import urlparse
+from hashlib import sha256
+from django.conf import settings
+
+def get_client_ip(request):
+ try:
+ x_forwarded_for = request.META.get("HTTP_X_FORWARDED_FOR")
+ if x_forwarded_for:
+ ip = x_forwarded_for.split(",")[0]
+ else:
+ ip = request.META.get("REMOTE_ADDR")
+ return ip
+ except Exception:
+ return None
+
+def get_dict_value(dict, key, default):
+ if dict and key in dict:
+ return dict[key]
+ return default
+
+# When exporting data, we shouldn't really make hash(ip) public because it's
+# too easy to reverse. Use HMAC with SECRET_KEY as a keyed hash, to prevent
+# reversing while still being usable as a unique identifier within a single
+# exported set of data
+def disguise_client_ip(ip):
+ key = settings.SECRET_KEY.encode("utf-8")
+ encoded_ip = ip.encode("utf-8")
+ return hmac.new(key, encoded_ip, sha256).hexdigest()
+
+def is_image_url(url):
+ file_ext = url.path.lower().split('.')[-1]
+ return file_ext in [ "jpg", "jpeg", "png" ]
diff --git a/src/collector/views.py b/src/collector/views.py
index 0f4245f..26acf2e 100644
--- a/src/collector/views.py
+++ b/src/collector/views.py
@@ -1,3 +1,9 @@
+from urllib.parse import urlparse
+import csv
+import requests
+import re
+import json
+
from django.http import HttpResponse
from django.http import Http404
from django.template import loader
@@ -7,114 +13,78 @@
from django.db.models import Count, F, Max
from django.utils.decorators import method_decorator
from django.conf import settings
-from .models import *
+from rest_framework.decorators import action
+from rest_framework.response import Response
+from rest_framework import mixins, status, viewsets
+
+from .models import PuzzlePiece, Transcription
from .serializers import (
PuzzlePieceSerializer,
TranscriptionDataSerializer,
BadImageSerializer,
ConfidentSolutionSerializer,
)
-import json
-#from django.db import transaction
-from . import UtilityOps as UtilityOps
-from urllib.parse import urlparse
-from random import randint
-import csv
-import hashlib
-import hmac
-import requests
-import re
-from rest_framework.decorators import action
-from rest_framework.response import Response
-from rest_framework import mixins, status, viewsets
+from .utility import *
+
+# Settings for the confidence and image distribution logic
+DEFAULT_PUZZLE_PIECE_PRIORITY = 10
+IMAGE_POOL_SIZE = 100
+CONFIDENCE_RATIO = 80
+MIN_TRANSCRIPTIONS = 3
+MIN_ROTATED_TRANSCRIPTIONS = 5
+BAD_FLAG_THRESHOLD = 2
# Regex pattern for text submissions
-textSubmissionPattern = re.compile(r"^\s*(?P(Blank|Plus|Clover|Hex|Snake|Diamond|Cauldron|B|P|C|H|S|D|T))\s*(?P([1-6])(\s*,\s*[1-6]){0,5})(?P(\s*[BPCHSDT]{7}){6})\s*$", re.IGNORECASE)
-
-def hash_my_data(url):
- url = url.encode("utf-8")
- hash_object = hashlib.sha256(url)
- hex_dig = hash_object.hexdigest()
- return hex_dig
-
-# When exporting data, we shouldn't really make hash(ip) public because it's
-# too easy to reverse. Use HMAC with SECRET_KEY as a keyed hash, to prevent
-# reversing while still being usable as a unique identifier within a single
-# exported set of data
-def secretly_hash_my_data(data):
- key = settings.SECRET_KEY.encode("utf-8")
- data = data.encode("utf-8")
- hash_object = hmac.new(key, data, hashlib.sha256)
- hex_dig = hash_object.hexdigest()
- return hex_dig
-
-def findImage(url):
- host = urlparse(url).hostname
- if host in ["imgur.com"]:
- # Can we be clever and figure out an Imgur URL on the fly?
- turl = "https://i.imgur.com" + urlparse(url).path + ".png"
- res = requests.head(turl)
- if res.status_code == 200:
- return turl
- turl = "https://i.imgur.com" + urlparse(url).path + ".jpg"
- res = requests.head(turl)
- if res.status_code == 200:
- return turl
- turl = "https://i.imgur.com" + urlparse(url).path + ".jpeg"
- res = requests.head(turl)
- if res.status_code == 200:
- return turl
- return None
- elif host in ["gyazo.com"]:
- turl = "https://i.gyazo.com" + urlparse(url).path + ".png"
- res = requests.head(turl)
- if res.status_code == 200:
- return turl
- turl = "https://i.gyazo.com" + urlparse(url).path + ".jpg"
- res = requests.head(turl)
- if res.status_code == 200:
- return turl
- turl = "https://i.gyazo.com" + urlparse(url).path + ".jpeg"
- res = requests.head(turl)
- if res.status_code == 200:
- return turl
- return None
- else:
- return None
+TEXT_SUBMISSION_PATTERN = re.compile(r"^\s*(?P(Blank|Plus|Clover|Hex|Snake|Diamond|Cauldron|B|P|C|H|S|D|T))\s*(?P([1-6])(\s*,\s*[1-6]){0,5})(?P(\s*[BPCHSDT]{7}){6})\s*$", re.IGNORECASE)
+
+# Image url patterns used in find_image
+IMAGE_URL_PATTERNS = ({
+    "imgur.com": [ "https://i.imgur.com/{}.png", "https://i.imgur.com/{}.jpg", "https://i.imgur.com/{}.jpeg" ],
+    "gyazo.com": [ "https://i.gyazo.com/{}.png", "https://i.gyazo.com/{}.jpg", "https://i.gyazo.com/{}.jpeg" ]
+})
+IMAGE_URL_WHITELIST = ["cdn.discordapp.com", "media.discordapp.net", "i.gyazo.com", "gyazo.com", "i.imgur.com", "imgur.com"]
+
+def find_image(url):
+    """Return a direct image URL for a parsed gallery URL, or None if no candidate answers with HTTP 200."""
+    image_id = url.path.strip("/")  # the patterns already supply the "/" separator
+    if url.hostname in IMAGE_URL_PATTERNS:
+        for url_pattern in IMAGE_URL_PATTERNS[url.hostname]:
+            target_url = url_pattern.format(image_id)
+            if requests.head(target_url).status_code == 200:
+                return target_url
+    return None
-def findUnconfidentPuzzlePieces(self):
+def find_unconfident_puzzle_piece(request):
+ client_identifier = disguise_client_ip(get_client_ip(request))
# We want to order by transCount descending to get faster results. We do not show anything definitely flagged as bad; that already has been solved
- # Allow multiple transcriptions by one person - at the current load the database query is just to expensive
- result = PuzzlePiece.objects.raw("""
+ result = PuzzlePiece.objects.raw(f"""
SELECT * FROM
- (SELECT * FROM collector_puzzlepiece WHERE
- id NOT IN (SELECT puzzlePiece_id FROM collector_confidentsolution) AND
- id NOT IN (SELECT puzzlePiece_id FROM collector_badimage)
- ORDER BY priority DESC, transCount DESC
- LIMIT 100
- ) AS current_transcriptions
+ (SELECT * FROM collector_puzzlepiece pp WHERE
+ confidence < {CONFIDENCE_RATIO} AND
+            (SELECT COALESCE(SUM(t.bad_flag), 0) FROM collector_transcription t WHERE t.puzzle_piece_id = pp.id) < {BAD_FLAG_THRESHOLD} AND
+            NOT EXISTS (SELECT t.id FROM collector_transcription t WHERE t.puzzle_piece_id = pp.id AND t.submitter = '{client_identifier}')
+ ORDER BY pp.priority DESC
+ LIMIT {IMAGE_POOL_SIZE}
+ ) ct
ORDER BY RAND()
LIMIT 1
""")
- # Want less than a certain confidence.
- # X or more "bad image" records will disqualify from showing up again.
-
- if len(result) > 0:
- # Add an isImage that we'll reference in the template, this allows us to handle generic links
- parsedUrl = urlparse(result[0].url)
- if parsedUrl.path.lower().endswith(".jpg") or parsedUrl.path.lower().endswith(".png") or parsedUrl.path.lower().endswith(".jpeg"):
- result[0].isImage = True
- else:
- result[0].isImage = False
- # Warn if rotateod
- rotated = PuzzlePiece.objects.raw('SELECT id FROM collector_rotatedimage WHERE puzzlePiece_id = ' + str(result[0].id))
- if rotated:
- result[0].isRotated = True
- else:
- result[0].isRotated = False
- return result[0]
- return None
+ if len(result) == 0:
+ return None
+ result = result[0]
+
+ # Add an is_image that we'll reference in the template, this allows us to handle generic links
+ result.is_image = is_image_url(urlparse(result.url))
+
+ # Warn if rotated
+    # A RawQuerySet cannot be compared with an int (and raw() needs the
+    # primary key selected), so look for flagged transcriptions via the ORM.
+    result.is_rotated = Transcription.objects.filter(
+        puzzle_piece_id=result.id,
+        rotation_flag=True,
+    ).exists()
+ return result
@cache_page(60 * 60)
def index(request):
@@ -126,39 +96,37 @@ def transcriptionGuide(request):
template = loader.get_template("collector/transcriptionGuide.html")
return HttpResponse(template.render(None, request))
-def puzzlepieceSubmit(request):
+def submit_puzzle_piece(request):
responseMessage = None
responseMessageSuccess = None
- # New submissions are first in queue, but behind specific streamer requests
- priority = 10
try:
if request.method == "POST":
url = request.POST["url"].strip()
if len(url) > 200:
- raise ValueError('You havin\' a laff, mate? A URL that long? Yeah no.')
- host = urlparse(url).hostname
- if host not in ["cdn.discordapp.com", "media.discordapp.net", "i.gyazo.com", "gyazo.com", "i.imgur.com", "imgur.com"]:
- raise ValueError('We only accept images from cdn.discordapp.com, media.discordapp.net, (i.)gyazo.com and (i.)imgur.com right now.')
- if host not in ["gyazo.com", "imgur.com"] and not (url.lower().endswith(".jpg") or url.lower().endswith(".png") or url.lower().endswith(".jpeg")):
- raise ValueError('Please make sure your link ends with .jpg or .jpeg or .png. Direct links to images work best with our current site.')
- if host in ["gyazo.com", "imgur.com"]:
- turl = findImage(url)
- if turl:
- url = turl
- if url.find("http",8,len(url)) != -1:
- raise ValueError('Found http in the middle of the URL - did you paste it twice?' + url)
+ raise ValueError("You havin\' a laff, mate? A URL that long? Yeah no.")
+ parsed_url = urlparse(url)
+ if parsed_url.hostname not in IMAGE_URL_WHITELIST:
+ raise ValueError(f"We only accept images from {', '.join(IMAGE_URL_WHITELIST)} right now.")
+ # Try to convert to an image url if not already present
+ if not is_image_url(parsed_url):
+ target_url = find_image(parsed_url)
+ if target_url:
+ url = target_url
+ else:
+ raise ValueError('Please make sure your link ends with .jpg or .jpeg or .png. Direct links to images work best with our current site.')
+ # Check if image is reachable
res = requests.head(url)
if res.status_code != 200:
raise ValueError(url + ' -- That URL does not seem to exist. Please verify and try again.')
- newPiece = PuzzlePiece()
- newPiece.url = url
- newPiece.hash = hash_my_data(url)
+ new_piece = PuzzlePiece()
+ new_piece.url = url
# An IP is personal data as per GDPR, kid you not. Let's hash it, we just need something unique
- newPiece.ip_address = hash_my_data(UtilityOps.UtilityOps.GetClientIP(request))
- newPiece.priority = priority
- newPiece.save()
+ new_piece.submitter = disguise_client_ip(get_client_ip(request))
+ new_piece.priority = DEFAULT_PUZZLE_PIECE_PRIORITY
+ new_piece.save()
+
responseMessageSuccess = "Puzzle Piece image submitted successfully!"
except KeyError as ex:
responseMessage = "There was an issue with your request. Please try again?"
@@ -177,178 +145,109 @@ def puzzlepieceSubmit(request):
}
return HttpResponse(template.render(context, request))
-@method_decorator(cache_page(5 * 60), name='dispatch')
-class PuzzlepieceIndex(generic.ListView):
+@method_decorator(cache_page(60), name='dispatch')
+class PuzzlePieceIndex(generic.ListView):
template_name = 'collector/latest.html'
context_object_name = 'latest'
def get_queryset(self):
-        return PuzzlePiece.objects.order_by("-submitted_date")[:50]
+        return PuzzlePiece.objects.order_by("-submission_date")[:50]
-
-def puzzlepieceView(request, image_id):
+def puzzle_piece_view(request, image_id):
piece = get_object_or_404(PuzzlePiece, pk=image_id)
- if len(piece.hash) == 0 or "empty" == str(piece.hash).lower():
- piece.hash = hash_my_data(piece.url)
- piece.save()
-
- #transcriptions = TranscriptionData.objects.filter(puzzlePiece_id=image_id)
-
context = {
- "puzzlepiece": piece,
-
+ "puzzlepiece": piece
}
- #"transcriptions": transcriptions
return render(request, 'collector/puzzlepieceDetail.html', context)
-
class TranscribeIndex(generic.ListView):
template_name = 'collector/transcribe.html'
context_object_name = 'puzzlepiece'
def get_queryset(self):
- return findUnconfidentPuzzlePieces(self)
-
+ return find_unconfident_puzzle_piece(self.request)
-def processTranscription(request, puzzlepiece_id):
+def process_transcription(request, puzzlepiece_id):
data = None
errors = None
- transcriptData = None
+ transcript_data = None
if request.method == "POST":
- if "bad_image" in request.POST:
- bad_image = request.POST["bad_image"]
- else:
- bad_image = False
- if "rotated_image" in request.POST:
- rotated_image = request.POST["rotated_image"]
- else:
- rotated_image = False
+        is_bad_image = bool(request.POST.get("bad_image"))  # any non-empty value means the flag is set
+        is_rotated_image = bool(request.POST.get("rotated_image"))  # BooleanField rejects raw strings such as "on"
data = request.POST["data"]
try:
data = json.loads(data)
except Exception:
- match = textSubmissionPattern.match(data)
- if match:
- data = parse_data_string(match)
- else:
- data = None
+ # If parsing the data fails we know that the data might be provided as plain text in the known format
+ match = TEXT_SUBMISSION_PATTERN.match(data)
+ data = parse_plain_data(match) if match else None
puzzlePiece = get_object_or_404(PuzzlePiece, pk=puzzlepiece_id)
# Hash IP bcs of GDPR
- client_ip_address = hash_my_data(UtilityOps.UtilityOps.GetClientIP(request))
- errors, transcriptData = processTransscriptionData(data, bad_image, rotated_image, puzzlePiece, client_ip_address)
+ client_identifier = disguise_client_ip(get_client_ip(request))
+ errors, transcript_data = process_transcription_data(data, is_bad_image, is_rotated_image, puzzlePiece, client_identifier)
-        determineConfidence(puzzlepiece_id)
+        determine_confidence(puzzlepiece_id)
context = {
"data": data,
"errors": errors,
- "transcript": transcriptData
+ "transcript": transcript_data
}
return render(request, "collector/transcribeResults.html", context=context)
-
-
-def parse_data_string(matchedData):
+def parse_plain_data(matched_data):
data_dict = {}
# Sometimes the center is fully written out. Other times it is not.
# This allows for both.
- data_dict["center"] = "T" if matchedData.group('center') == "Cauldron" else matchedData.group('center').upper()[0]
+ data_dict["center"] = "T" if matched_data.group('center') == "Cauldron" else matched_data.group('center').upper()[0]
# Wall list of length 6. Default is wall true, since string contains list of
# openings.
data_dict["walls"] = [True] * 6
- for opening in matchedData.group('sides').split(","):
+ for opening in matched_data.group('sides').split(","):
data_dict["walls"][int(opening) - 1] = False
# Node list. Split string into list of strings, then split each side
# into a list of characters.
data_dict["nodes"] = []
- side_list = matchedData.group('links').split()
+ side_list = matched_data.group('links').split()
for side in side_list:
data_dict["nodes"].append(list(side.upper()))
return data_dict
-
-
-def processTransscriptionData(rawData, bad_image, rotated_image, puzzlePiece, client_ip_address):
- if bad_image and bool(bad_image) == True:
- transcriptData = TranscriptionData()
- transcriptData.ip_address = client_ip_address
- transcriptData.puzzlePiece = puzzlePiece
- transcriptData.bad_image = True
- transcriptData.datahash = "badimage"
-
- transcriptData.center = ""
- transcriptData.wall1 = False
- transcriptData.wall2 = False
- transcriptData.wall3 = False
- transcriptData.wall4 = False
- transcriptData.wall5 = False
- transcriptData.wall6 = False
- transcriptData.link1 = ""
- transcriptData.link2 = ""
- transcriptData.link3 = ""
- transcriptData.link4 = ""
- transcriptData.link5 = ""
- transcriptData.link6 = ""
- if rotated_image and bool(rotated_image) == True:
- transcriptData.orientation = "wrong"
-
- transcriptData.save()
- return [], transcriptData
-
- center = UtilityOps.UtilityOps.GetDictValues(rawData, "center", None)
- walls = UtilityOps.UtilityOps.GetDictValues(rawData, "walls", None)
- edges = UtilityOps.UtilityOps.GetDictValues(rawData, "nodes", None)
-
+def process_transcription_data(raw_transcription, is_bad, is_rotated, puzzle_piece, client_identifier):
+ transcriptData = Transcription()
+ transcriptData.submitter = client_identifier
+ transcriptData.puzzle_piece = puzzle_piece
+ transcriptData.bad_flag = is_bad
+ transcriptData.rotation_flag = is_rotated
errors = []
- if not center:
- errors.append("No center value was found in the JSON. This is required.")
- if walls and len(walls) != 6:
- errors.append("There should be 6 walls in the JSON. {} were found.".format(len(walls)))
- if edges and len(edges) != 6:
- errors.append("There should be 6 edges/nodes in the JSON. {} were found.".format(len(edges)))
-
- transcriptData = None
- if len(errors) == 0:
- # Prepare the Transcription Data
- transcriptData = TranscriptionData()
- transcriptData.bad_image = False
- transcriptData.ip_address = client_ip_address
- transcriptData.puzzlePiece = puzzlePiece
- transcriptData.center = center
-
- transcriptData.wall1 = walls[0]
- transcriptData.wall2 = walls[1]
- transcriptData.wall3 = walls[2]
- transcriptData.wall4 = walls[3]
- transcriptData.wall5 = walls[4]
- transcriptData.wall6 = walls[5]
-
- linkJoiner = ""
- transcriptData.link1 = linkJoiner.join(edges[0])
- transcriptData.link2 = linkJoiner.join(edges[1])
- transcriptData.link3 = linkJoiner.join(edges[2])
- transcriptData.link4 = linkJoiner.join(edges[3])
- transcriptData.link5 = linkJoiner.join(edges[4])
- transcriptData.link6 = linkJoiner.join(edges[5])
- hashStr = center + ' ' + str(1 if walls[0] == True else 0) + str(1 if walls[1] == True else 0) + str(1 if walls[3] == True else 0) + \
- str(1 if walls[4] == True else 0) + str(1 if walls[4] == True else 0) + str(1 if walls[5] == True else 0) + ' ' + \
- transcriptData.link1 + ' ' + transcriptData.link2 + ' ' + transcriptData.link3 + ' ' + \
- transcriptData.link4 + ' ' + transcriptData.link5 + ' ' + transcriptData.link6
-
- transcriptData.datahash = hash_my_data(hashStr.upper())
- if rotated_image and bool(rotated_image) == True: transcriptData.orientation = "wrong"
-
- transcriptData.save()
+    # If the image is marked as bad we can skip parsing the transcription and stick to the default values
+    if not transcriptData.bad_flag:
+        center = get_dict_value(raw_transcription, "center", None)
+        walls = get_dict_value(raw_transcription, "walls", None)
+        edges = get_dict_value(raw_transcription, "nodes", None)
+        if not center:
+            errors.append("No center value was found in the JSON. This is required.")
+        if not walls or len(walls) != 6:  # a missing list is as invalid as one of the wrong length
+            errors.append("There should be 6 walls in the JSON. {} were found.".format(len(walls or [])))
+        if not edges or len(edges) != 6:
+            errors.append("There should be 6 edges/nodes in the JSON. {} were found.".format(len(edges or [])))
+        if errors:
+            return errors, None  # never persist a transcription that failed validation
+        transcriptData.center = center
+        for i, (wall, edge) in enumerate(zip(walls, edges), start=1):
+            setattr(transcriptData, f"wall{i}", wall)
+            setattr(transcriptData, f"link{i}", "".join(edge))  # nodes arrive as lists of characters
+    transcriptData.save()
return errors, transcriptData
-@method_decorator(cache_page(5 * 60), name='dispatch')
+@method_decorator(cache_page(60), name='dispatch')
class TranscriptionsIndex(generic.ListView):
template_name = 'collector/transcriptions.html'
context_object_name = 'latest'
@@ -356,264 +255,82 @@ class TranscriptionsIndex(generic.ListView):
def get_queryset(self):
return TranscriptionData.objects.order_by("-submitted_date")[:50]
-
-def transcriptionsDetail(request, transcription_id):
- transcription = get_object_or_404(TranscriptionData, pk=transcription_id)
-
- if not transcription.datahash:
- pass
-
+def transcriptions_detail(request, transcription_id):
+ transcription = get_object_or_404(Transcription, pk=transcription_id)
context = {
"transcription": transcription,
- "puzzlepiece": transcription.puzzlePiece
+ "puzzlepiece": transcription.puzzle_piece
}
return render(request, 'collector/transcriptionDetail.html', context)
-
-@method_decorator(cache_page(5 * 60), name='dispatch')
-class ConfidenceIndex(generic.ListView):
- model = ConfidenceTracking
- template_name = 'collector/confidenceIndex.html'
- context_object_name = 'latest'
-
-def confidenceDetail(request, confidence_id):
- confidence = get_object_or_404(ConfidenceTracking, pk=confidence_id)
-
- if request.method == "POST":
- if "rerun" in request.POST:
- print("rerun")
- determineConfidence(confidence.puzzlePiece.id)
- confidence = get_object_or_404(ConfidenceTracking, pk=confidence_id)
-
- context = {
- "confidence": confidence,
- }
- return render(request, 'collector/confidenceDetail.html', context)
-
-
-def determineConfidence(puzzlepieceId):
- data = TranscriptionData.objects.filter(puzzlePiece_id=puzzlepieceId)
+def determine_confidence(puzzle_piece_id):
+ data = Transcription.objects.filter(puzzle_piece_id=puzzle_piece_id)
hashes = {}
- confidenceRatio = 80
- rotatedConfidenceRatio = 90
- confidenceThreshold = 0 # We set this programmatically later
- minSubmissions = 10
- rotatedMinSubmissions = 15
- badCount = 0
- badThreshold = 4
- rotationCount = 0
- totalCount = len(data)
- updateTransCount(puzzlepieceId,totalCount)
-
- # Track bad images
+ bad_count = 0
+ rotation_count = 0
+
+ # Count transcriptions where the image had been marked as bad
for d in data:
if d.bad_image:
- badCount += 1
- continue
-
- if badCount >= badThreshold:
- setOrUpdateBadImage(puzzlepieceId, badCount)
+ bad_count += 1
+ # Stop calculation when BAD_FLAG_THRESHOLD is reached
+ if bad_count >= BAD_FLAG_THRESHOLD:
return
- # Track rotated images
+ # Count the transcriptions that flagged the image as rotated
for d in data:
- if d.orientation == "wrong":
- rotationCount += 1
- continue
-
- if rotationCount > 0:
- setOrUpdateRotatedImage(puzzlepieceId, rotationCount)
-
- # Adjust totalCount, we will exclude bad Image submissions
- for d in data:
- if d.bad_image:
- totalCount -= 1
- continue
+ if d.rotation_flag:
+ rotation_count += 1
+ # Number of valid transcriptions: submissions marked as bad image are excluded from here on
+ valid_transcriptions = len(data) - bad_count
# Is there enough data to determine a confidence level?
# If no, create or update a tracker entry.
- if not rotationCount and totalCount < minSubmissions:
- tracker = setOrUpdateConfidenceTracking(puzzlepieceId, totalCount)
- return
- elif rotationCount and totalCount < rotatedMinSubmissions:
- tracker = setOrUpdateConfidenceTracking(puzzlepieceId, totalCount)
+ if (rotation_count == 0 and valid_transcriptions < MIN_TRANSCRIPTIONS) or (rotation_count > 0 and valid_transcriptions < MIN_ROTATED_TRANSCRIPTIONS):
return
for d in data:
- if d.datahash not in hashes:
- hashes[d.datahash] = 0
- hashes[d.datahash] = hashes[d.datahash] + 1
- # solution confidence threshold is...
- if not rotationCount:
- confidenceThreshold = confidenceRatio
- else:
- confidenceThreshold = rotatedConfidenceRatio
-
- biggest = 0
- biggesthash = None
- if hashes:
- for hash, count in hashes.items():
- if count > biggest:
- biggest = count
- biggesthash = hash
-
- confidence = (biggest / totalCount) * 100
- # Update the confidence...
- tracker = setOrUpdateConfidenceTracking(puzzlepieceId, confidence)
-
- if confidence >= confidenceThreshold:
- # find the first transcription data object with the hash...
- for d in data:
- if d.datahash == biggesthash:
- setOrUpdateConfidenceSolution(puzzlepieceId, confidence, d.id)
- break
-
-
-def updateTransCount(puzzlepieceId, transCount):
- try:
- piece = PuzzlePiece.objects.get(id=puzzlepieceId)
- piece = PuzzlePiece.objects.filter(id=piece.id).update(transCount=transCount)
- return piece
- except Exception as ex:
- piece = None
+ if d.hash not in hashes:
+ hashes[d.hash] = 0
+ hashes[d.hash] += 1
-def setOrUpdateBadImage(puzzlepieceId, badCount):
- try:
- bad = BadImage.objects.get(puzzlePiece_id=puzzlepieceId)
- bad = BadImage.objects.filter(id=bad.id).update(badCount=badCount)
- return bad
- except Exception as ex:
- bad = None
+ most_common_hash_count = 0
+ for _, count in hashes.items():
+ if count > most_common_hash_count:
+ most_common_hash_count = count
- bad = BadImage()
- bad.puzzlePiece = get_object_or_404(PuzzlePiece, pk=puzzlepieceId)
- bad.badCount = badCount
- bad.save()
- return bad
+ confidence = (most_common_hash_count / valid_transcriptions) * 100
+ # Update the confidence on the puzzle piece
+ upsert_confidence(puzzle_piece_id, confidence)
-def setOrUpdateRotatedImage(puzzlepieceId, rotationCount):
- try:
- rotated = RotatedImage.objects.get(puzzlePiece_id=puzzlepieceId)
- rotated = RotatedImage.objects.filter(id=rotated.id).update(rotatedCount=rotationCount)
- return rotated
- except Exception as ex:
- rotated = None
+def upsert_confidence(puzzle_piece_id, confidence):
+ puzzle_piece = get_object_or_404(PuzzlePiece, pk=puzzle_piece_id)
+ puzzle_piece.confidence = confidence
+ puzzle_piece.save()
- rotated = RotatedImage()
- rotated.puzzlePiece = get_object_or_404(PuzzlePiece, pk=puzzlepieceId)
- rotated.rotatedCount = rotationCount
- rotated.save()
- return rotated
-
-def setOrUpdateConfidenceTracking(puzzlepieceId, confidence):
- try:
- tracker = ConfidenceTracking.objects.get(puzzlePiece_id=puzzlepieceId)
- tracker = ConfidenceTracking.objects.filter(id=tracker.id).update(confidence=confidence)
- return tracker
- except Exception as ex:
- tracker = None
-
- tracker = ConfidenceTracking()
- tracker.puzzlePiece = get_object_or_404(PuzzlePiece, pk=puzzlepieceId)
- tracker.confidence = confidence
- tracker.save()
- return tracker
-
-def setOrUpdateConfidenceSolution(puzzlepieceId, confidence, transcriptiondataId):
- try:
- solution = ConfidentSolution.objects.get(puzzlePiece_id=puzzlepieceId)
- solution = ConfidentSolution.objects.filter(id=solution.id).update(
- confidence=confidence
- )
- return solution
- except Exception as ex:
- solution = None
-
-
- solution = ConfidentSolution()
-
- transcription = TranscriptionData.objects.get(id=transcriptiondataId)
- print("looking for {} and found {}".format(transcriptiondataId, transcription.id))
- solution.center = transcription.center
- solution.wall1 = transcription.wall1
- solution.wall2 = transcription.wall2
- solution.wall3 = transcription.wall3
- solution.wall4 = transcription.wall4
- solution.wall5 = transcription.wall5
- solution.wall6 = transcription.wall6
-
- solution.link1 = transcription.link1
- solution.link2 = transcription.link2
- solution.link3 = transcription.link3
- solution.link4 = transcription.link4
- solution.link5 = transcription.link5
- solution.link6 = transcription.link6
-
- solution.datahash = transcription.datahash
- solution.puzzlePiece = get_object_or_404(PuzzlePiece, pk=puzzlepieceId)
-
- solution.confidence = confidence
- solution.save()
-
- return solution
-
-@method_decorator(cache_page(5 * 60), name='dispatch')
+@method_decorator(cache_page(60), name='dispatch')
class ConfidenceSolutionIndex(generic.ListView):
- model = ConfidentSolution
template_name = 'collector/confidenceSolutionIndex.html'
context_object_name = "collection"
-
-def confidenceSolutionDetail(request, solution_id):
- solution = get_object_or_404(ConfidentSolution, pk=solution_id)
-
- context = {
- "solution": solution,
- }
- return render(request, 'collector/confidenceSolutionDetail.html', context)
+ def get_queryset(self):
+ return PuzzlePiece.objects.filter(confidence__gte=CONFIDENCE_RATIO)
class PuzzlePieceViewSet(viewsets.ReadOnlyModelViewSet):
- # annotate badimages count for serializer performance
- queryset = PuzzlePiece.objects.all().annotate(
- badimage_count=Max('badimages__badCount'),
- )
- serializer_class = PuzzlePieceSerializer
-
- @action(detail=False)
- def get_random(self, request):
- qs = PuzzlePiece.objects.annotate(
- transcription_count=Count('transcriptions'),
- ).filter(transcription_count__lt=5)
- count = qs.aggregate(count=Count('pk'))['count']
- print(count)
- if count < 1:
- return Response({}, status=status.HTTP_404_NOT_FOUND)
- random_index = randint(0, count - 1)
- rando = qs[random_index]
- serializer = self.get_serializer(rando)
- return Response(serializer.data)
-
- @action(detail=True, methods=['post'])
- def report(self, request, *args, **kwargs):
- piece = self.get_object()
- if piece.badimages.count() > 0:
- # atomic increment of the count on all existing BadImages
- piece.badimages.update(badCount=F('badCount')+1)
- else:
- # create a BadImage... might have a race condition :(
- bad = BadImage(puzzlePiece=piece, badCount=1)
- bad.save()
-
- # go ahead and return the updated piece
- piece = self.get_object()
- serializer = self.get_serializer(piece)
- return Response(serializer.data)
-
-
-class TranscriptionDataViewSet(viewsets.GenericViewSet, mixins.CreateModelMixin):
- queryset = TranscriptionData.objects.all()
+ # Plain queryset of all puzzle pieces (no badimages annotation is applied here)
+ queryset = PuzzlePiece.objects.all()
+ serializer_class = PuzzlePieceSerializer
+
+ @action(detail=False)
+ def get_random(self, request):
+ unconfident_piece = find_unconfident_puzzle_piece(request)
+ serializer = self.get_serializer(unconfident_piece)
+ return Response(serializer.data)
+
+class TranscriptionViewSet(viewsets.GenericViewSet, mixins.CreateModelMixin):
+ queryset = Transcription.objects.all()
serializer_class = TranscriptionDataSerializer
# copied code from the mixin, but we need access to request here
@@ -621,19 +338,9 @@ def create(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
- x_forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR')
- if x_forwarded_for:
- ip = x_forwarded_for.split(',')[0]
- else:
- ip = request.META.get('REMOTE_ADDR')
-
- # TODO: ALSO MAKE HASH
-
- # ip_address in kwargs here *should* put it in?
- serializer.save(ip_address=ip)
-
+ client_identifier = disguise_client_ip(get_client_ip(request))
+ serializer.save(submitter=client_identifier)
headers = self.get_success_headers(serializer.data)
-
return Response(serializer.data, status=status.HTTP_201_CREATED, headers=headers)
@cache_page(60 * 5)
@@ -685,7 +392,7 @@ def exportVerifiedCSV(request):
return response
-@cache_page(60 * 10)
+@cache_page(60 * 5)
def exportPiecesCSV(request):
response = HttpResponse(content_type = 'text/csv')
response['Content-Disposition'] = 'attachment; filename="imgurls.csv"'