Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DO NOT MERGE: Major code rework #8

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions src/collector/UtilityOps.py

This file was deleted.

176 changes: 60 additions & 116 deletions src/collector/models.py
Original file line number Diff line number Diff line change
@@ -1,127 +1,71 @@
from django.db import models

from hashlib import sha256

class PuzzlePiece(models.Model):
url = models.URLField(verbose_name="image url")
hash = models.CharField(max_length=64, unique=True, default="empty", verbose_name="sha256 hash of the url")
ip_address = models.CharField(max_length=64, default="?.?.?.?", verbose_name="hash of submitter ip address")
submitted_date = models.DateTimeField(verbose_name="submitted date", auto_now_add=True)
last_modified = models.DateTimeField(verbose_name="last modified date", auto_now=True)
approved = models.NullBooleanField(verbose_name="is image approved for verification")
priority = models.PositiveIntegerField(default=0,verbose_name="Priority value in transcription queue")
transCount = models.PositiveIntegerField(default=0,verbose_name="Number of transcriptions received for this image")

def __str__(self):
data = []
data.append("URL: {}".format(self.url))
data.append("ip_address: {}".format(self.ip_address))
data.append("submitted_date: {}".format(self.submitted_date))
data.append("last_modified: {}".format(self.last_modified))
data.append("hash: {}".format(self.hash))
data.append("transCount: {}".format(self.transCount))
url = models.URLField(verbose_name="Image url")
hash = models.CharField(max_length=64, unique=True, verbose_name="SHA256 hash of the url", default=None, null=True, blank=True)
submitter = models.CharField(max_length=64, verbose_name="Hash of submitter ip address", default="", blank=True)
submission_date = models.DateTimeField(verbose_name="Submission date", auto_now_add=True)
priority = models.PositiveIntegerField(verbose_name="Priority value in transcription queue", default=0)
confidence = models.PositiveIntegerField(verbose_name="Confidence score", default=0, blank=True)

result = ""
for d in data:
result += "<li>{}</li>".format(d)
result = "<ul>" + result + "</ul>"
return result
def save(self, *args, **kwargs):
    """Refresh the URL hash, then persist the piece via the parent ``save``.

    All positional and keyword arguments are forwarded unchanged.
    """
    url_digest = self.calculate_hash()
    self.hash = url_digest
    super().save(*args, **kwargs)

def calculate_hash(self):
    """Return the hex SHA-256 digest of this piece's URL (UTF-8 encoded)."""
    url_bytes = str(self.url).encode('utf-8')
    digest = sha256(url_bytes)
    return digest.hexdigest()

class TranscriptionData(models.Model):
class Transcription(models.Model):
class Meta:
indexes = [
models.Index(fields=['ip_address'], name='ip_address_idx')
]

puzzlePiece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="transcriptions")
ip_address = models.CharField(max_length=64, default="?.?.?.?", verbose_name="hash of submitter ip address")
submitted_date = models.DateTimeField(verbose_name="submitted date", auto_now=True)

bad_image = models.BooleanField(verbose_name="image is bad or hard to read")
orientation = models.CharField(max_length=10, default="", verbose_name="orientation direction from image")
rawdata = models.TextField(default="", verbose_name="Raw JSON taken in for later debugging.")
datahash = models.CharField(max_length=64, default="", verbose_name="sha256 hash for easier comparisons")

center = models.CharField(max_length=20, verbose_name="center")

wall1 = models.BooleanField(verbose_name="wall 1 (top)")
wall2 = models.BooleanField(verbose_name="wall 2 (top-right)")
wall3 = models.BooleanField(verbose_name="wall 3 (bottom-right)")
wall4 = models.BooleanField(verbose_name="wall 4 (bottom)")
wall5 = models.BooleanField(verbose_name="wall 5 (bottom-left)")
wall6 = models.BooleanField(verbose_name="wall 6 (top-left)")

link1 = models.CharField(max_length=7, verbose_name="link 1 (top)")
link2 = models.CharField(max_length=7, verbose_name="link 2 (top-right)")
link3 = models.CharField(max_length=7, verbose_name="link 3 (bottom-right)")
link4 = models.CharField(max_length=7, verbose_name="link 4 (bottom)")
link5 = models.CharField(max_length=7, verbose_name="link 5 (bottom-left)")
link6 = models.CharField(max_length=7, verbose_name="link 6 (top-left)")

def __str__(self):
return "{} {} {}".format(self.center, self.wall1, self.link1)


class BadImage(models.Model):
puzzlePiece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="badimages")
last_modified = models.DateTimeField(verbose_name="last modified date", auto_now=True)
badCount = models.PositiveIntegerField(default=0,verbose_name="how often this image was reported as bad")

class Meta:
unique_together = ('id', 'puzzlePiece',)

class RotatedImage(models.Model):
puzzlePiece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="rotatedimages")
last_modified = models.DateTimeField(verbose_name="last modified date", auto_now=True)
rotatedCount = models.PositiveIntegerField(default=0,verbose_name="how often this image was reported as incorrectly rotated")

class Meta:
unique_together = ('id', 'puzzlePiece',)

class ConfidenceTracking(models.Model):
puzzlePiece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="confidences")
last_modified = models.DateTimeField(verbose_name="last modified date", auto_now=True)
confidence = models.PositiveIntegerField(default=0,verbose_name="how confident are we in this image, 0 to 100")

class Meta:
unique_together = ('id', 'puzzlePiece',)


class ConfidentSolution(models.Model):
puzzlePiece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="confidentsolutions")
last_modified = models.DateTimeField(verbose_name="last modified date", auto_now=True)
confidence = models.PositiveIntegerField(default=0,verbose_name="how confident are we in this image, 0 to 100")
datahash = models.CharField(max_length=64, default="", verbose_name="sha256 hash for easier comparisons")

center = models.CharField(max_length=20, verbose_name="center")

wall1 = models.BooleanField(verbose_name="wall 1 (top)")
wall2 = models.BooleanField(verbose_name="wall 2 (top-right)")
wall3 = models.BooleanField(verbose_name="wall 3 (bottom-right)")
wall4 = models.BooleanField(verbose_name="wall 4 (bottom)")
wall5 = models.BooleanField(verbose_name="wall 5 (bottom-left)")
wall6 = models.BooleanField(verbose_name="wall 6 (top-left)")

link1 = models.CharField(max_length=7, verbose_name="link 1 (top)")
link2 = models.CharField(max_length=7, verbose_name="link 2 (top-right)")
link3 = models.CharField(max_length=7, verbose_name="link 3 (bottom-right)")
link4 = models.CharField(max_length=7, verbose_name="link 4 (bottom)")
link5 = models.CharField(max_length=7, verbose_name="link 5 (bottom-left)")
link6 = models.CharField(max_length=7, verbose_name="link 6 (top-left)")

def copyFromTranscription(self, transcription):
self.puzzlePiece = transcription.puzzlePiece
self.center = transcription.center
self.wall1 = transcription.wall1
self.wall2 = transcription.wall2
self.wall3 = transcription.wall3
self.wall4 = transcription.wall4
self.wall5 = transcription.wall5
self.wall6 = transcription.wall6

self.link1 = transcription.link1
self.link2 = transcription.link2
self.link3 = transcription.link3
self.link4 = transcription.link4
self.link5 = transcription.link5
self.link6 = transcription.link6
# Metadata
puzzle_piece = models.ForeignKey(PuzzlePiece, on_delete=models.CASCADE, related_name="transcriptions", null=True, default=None)
submitter = models.CharField(max_length=64, verbose_name="Hash of submitter ip address", default="", blank=True)
submission_date = models.DateTimeField(verbose_name="Submission date", auto_now_add=True)
# Flags
bad_flag = models.BooleanField(verbose_name="Image is bad or hard to read", default=False)
rotation_flag = models.BooleanField(verbose_name="Image is rotated", default="")
# Center in short notation
center = models.CharField(max_length=1, verbose_name="center", default="")
# Walls
wall1 = models.BooleanField(verbose_name="Wall 1 (top)", default=False)
wall2 = models.BooleanField(verbose_name="Wall 2 (top-right)", default=False)
wall3 = models.BooleanField(verbose_name="Wall 3 (bottom-right)", default=False)
wall4 = models.BooleanField(verbose_name="Wall 4 (bottom)", default=False)
wall5 = models.BooleanField(verbose_name="Wall 5 (bottom-left)", default=False)
wall6 = models.BooleanField(verbose_name="Wall 6 (top-left)", default=False)
# Links
link1 = models.CharField(max_length=7, verbose_name="Link 1 (top)", default="")
link2 = models.CharField(max_length=7, verbose_name="Link 2 (top-right)", default="")
link3 = models.CharField(max_length=7, verbose_name="Link 3 (bottom-right)", default="")
link4 = models.CharField(max_length=7, verbose_name="Link 4 (bottom)", default="")
link5 = models.CharField(max_length=7, verbose_name="Link 5 (bottom-left)", default="")
link6 = models.CharField(max_length=7, verbose_name="Link 6 (top-left)", default="")
# SHA256 hash of 'center wallsAsBitstring link1 link2 link3 link4 link5 link6' for comparison
hash = models.CharField(max_length=64, verbose_name="SHA256 hash for comparison", default="", null=True, blank=True)

def save(self, *args, **kwargs):
    """Normalize the text fields, refresh the comparison hash, then save.

    A transcription flagged as bad gets no hash (``None``); any other
    transcription gets the digest from ``calculate_hash``. All arguments
    are forwarded unchanged to the parent ``save``.
    """
    self.sanitize_fields()
    self.hash = None if self.bad_flag else self.calculate_hash()
    super().save(*args, **kwargs)

def calculate_hash(self):
    """Return the hex SHA-256 digest used to compare transcriptions.

    The hashed text is ``'<center> <walls> <link1> ... <link6>'``, where
    ``<walls>`` is the six wall flags written as a string of 0/1 digits.
    """
    walls = (
        self.wall1, self.wall2, self.wall3,
        self.wall4, self.wall5, self.wall6,
    )
    links = (
        self.link1, self.link2, self.link3,
        self.link4, self.link5, self.link6,
    )
    wall_bits = "".join(str(int(flag)) for flag in walls)
    link_text = " ".join(f"{link}" for link in links)
    payload = f"{self.center} {wall_bits} {link_text}"
    return sha256(payload.encode("utf-8")).hexdigest()

def sanitize_fields(self):
    """Normalize the center and link fields in place.

    Each value has surrounding whitespace removed and is upper-cased, so
    that transcriptions differing only in case or stray spaces end up with
    identical field values.
    """
    text_fields = ("center", "link1", "link2", "link3", "link4", "link5", "link6")
    for name in text_fields:
        raw = getattr(self, name)
        # Strip before upper-casing: leading/trailing spaces would otherwise
        # make equal transcriptions compare as different.
        setattr(self, name, raw.strip().upper())
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had a discussion with someone in #bug-reports that they found two transcriptions for separate images, with the same transcription, but different hashes.

I was not able to reproduce the hashes (the no-hash bug was also found at this time and was more pressing), but my only guess is that there were extra white spaces in the center and/or link fields. I see that you've limited the field sizes appropriately to 1/7, but I'd suggest appending .strip() to the field sanitization so that possibly valid transcriptions aren't rejected just because of extra spaces.

15 changes: 6 additions & 9 deletions src/collector/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,20 @@

api_router = routers.DefaultRouter()
api_router.register(r'pieces', views.PuzzlePieceViewSet)
api_router.register(r'transcriptions', views.TranscriptionDataViewSet)
api_router.register(r'transcriptions', views.TranscriptionViewSet)

urlpatterns = [
path("", views.index, name="index"),
path('api/', include(api_router.urls)),
path("puzzlepieces/submit", views.puzzlepieceSubmit, name="puzzlepieceSubmit"),
path("puzzlepieces/", views.PuzzlepieceIndex.as_view(), name="puzzlepieceIndex"),
path("puzzlepieces/<int:image_id>/", views.puzzlepieceView, name="puzzlepieceView"),
path("puzzlepieces/submit", views.submit_puzzle_piece, name="puzzlepieceSubmit"),
path("puzzlepieces/", views.PuzzlePieceIndex.as_view(), name="puzzlepieceIndex"),
path("puzzlepieces/<int:image_id>/", views.puzzle_piece_view, name="puzzlepieceView"),
path("transcriptions", views.TranscriptionsIndex.as_view(), name="transcriptions"),
path("transcriptions/<int:transcription_id>", views.transcriptionsDetail, name="transcriptionsDetail"),
path("transcriptions/<int:transcription_id>", views.transcriptions_detail, name="transcriptionsDetail"),
path("transcriptions/guide", views.transcriptionGuide, name="transcriptionGuide"),
path("transcribe", views.TranscribeIndex.as_view(), name="transcribe"),
path("transcribe/<int:puzzlepiece_id>", views.processTranscription, name="transcribeResults"),
path("confidence", views.ConfidenceIndex.as_view(), name="confidenceIndex"),
path("confidence/<int:confidence_id>", views.confidenceDetail, name="confidenceDetail"),
path("transcribe/<int:puzzlepiece_id>", views.process_transcription, name="transcribeResults"),
path("solutions", views.ConfidenceSolutionIndex.as_view(), name="confidenceSolutionIndex"),
path("solutions/<int:solution_id>", views.confidenceSolutionDetail, name="confidenceSolutionDetail"),
path("export/verified/csv", views.exportVerifiedCSV, name="exportVerifiedCSV"),
path("export/pieces/csv", views.exportPiecesCSV, name="exportPiecesCSV"),
path("export/transcriptions/csv", views.exportTranscriptionsCSV, name="exportTranscriptionsCSV"),
Expand Down
33 changes: 33 additions & 0 deletions src/collector/utility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import hmac
from urllib.parse import urlparse
from hashlib import sha256
from django.conf import settings

def get_client_ip(request):
    """Return the client IP address for ``request``, or ``None`` on failure.

    The first entry of the ``X-Forwarded-For`` header is preferred, with
    surrounding whitespace removed. If the header is missing, or its first
    entry is empty, ``REMOTE_ADDR`` is used instead.

    NOTE(review): ``X-Forwarded-For`` is supplied by the client unless a
    trusted proxy overwrites it -- confirm the deployment sets it.
    """
    try:
        forwarded = request.META.get("HTTP_X_FORWARDED_FOR")
        if forwarded:
            # Entries are comma-separated and may carry padding spaces.
            first_hop = forwarded.split(",")[0].strip()
            if first_hop:
                return first_hop
        return request.META.get("REMOTE_ADDR")
    except Exception:
        # Deliberate best-effort: a malformed request yields None.
        return None

def get_dict_value(dict, key, default):
    """Return ``dict[key]`` when present, otherwise ``default``.

    ``default`` is also returned when ``dict`` is falsy (``None`` or empty).
    """
    if not dict or key not in dict:
        return default
    return dict[key]

# Exported data should not expose a plain hash(ip): it is too easy to
# reverse. An HMAC keyed with SECRET_KEY prevents reversing while still
# serving as a unique identifier within a single exported data set.
def disguise_client_ip(ip):
    """Return the hex HMAC-SHA256 of ``ip``, keyed with ``settings.SECRET_KEY``."""
    mac = hmac.new(
        settings.SECRET_KEY.encode("utf-8"),
        ip.encode("utf-8"),
        sha256,
    )
    return mac.hexdigest()

def is_image_url(url):
    """Return True when the URL path ends in a jpg, jpeg or png extension.

    ``url`` must expose a ``path`` attribute; the check is case-insensitive.
    """
    # assumes url is a urllib.parse.urlparse() result -- TODO confirm with callers
    lowered_path = url.path.lower()
    extension = lowered_path.rsplit('.', 1)[-1]
    return extension in ("jpg", "jpeg", "png")
Loading