kleinias committed Feb 20, 2024
1 parent 3ab1b03 commit be2e732
Showing 8 changed files with 162 additions and 39 deletions.
4 changes: 2 additions & 2 deletions classifier/data/triangle_pos.py
@@ -124,7 +124,7 @@ def make_sample(which_kind, which):
else:
kind = ConceptPosition.MORE_COMPOUND
elif which_kind == 2 or which_kind == 4:
path2, extension = random_123_string(3, start_string=path, longer=True, return_extension=True, global_prefix=GLOBAL_PREFIX)
path2 = random_123_string(3, global_prefix=GLOBAL_PREFIX)
file_dict2 = get_from_triangle(path2)
sub_path, matching_entries = find_matching_entries(file_dict2, path2)
_a, _b, _c, _d = matching_entries
@@ -143,7 +143,7 @@ def make_sample(which_kind, which):
if which == 4:
X = d
d = _d
kind = ConceptPosition.SUMMARIZING_CONCEPT
kind = ConceptPosition.LESS_COMPOUND
elif which_kind == 3 or which_kind == 5 or which_kind== 6 or which_kind ==7:
path2, extension = random_123_string(4, min_length=2 ,start_string=path, longer=True, return_extension=True, global_prefix=GLOBAL_PREFIX)

37 changes: 37 additions & 0 deletions classifier/data/weight_pos.py
@@ -0,0 +1,37 @@
from collections import Counter
from pprint import pprint

from classifier.data.triangle_pos import data_path, yield_from_file, ConceptPosition
from lib.json import decode, encode

print ("loading")
all_samples = []
n = 0
with open(data_path, 'r') as file:
lines = file.readlines()
for line in lines:
# Deserialize the JSON string back into a tuple
item = decode(line, ConceptPosition)
all_samples.append(item)
n += 1
print ("loaded")
c_labels = Counter([_[0][1] for _ in all_samples])
("counted")
print (c_labels)

relative_prob = {
k: 1/(v/n)
for k, v in
c_labels.items()
}
pprint(relative_prob)

a = sum(relative_prob.values())
relative_prob = {
k.name: v/a
for k, v in
relative_prob.items()
}
pprint(relative_prob)

print ([relative_prob.get(k, 0) for k in list(ConceptPosition.__members__)])
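
Taken together, the new script derives inverse-frequency class weights: each label's raw weight is 1/(count/n), the raw weights are normalized to sum to 1, and a class with no samples falls back to 0 via relative_prob.get(k, 0); the printed list lines up with the loss_weights entry added to models.yml in this commit. A minimal sketch of the same arithmetic on made-up counts (the label names and numbers below are illustrative, not values from the dataset):

from collections import Counter

# Hypothetical counts standing in for the real ConceptPosition tallies.
counts = Counter({"MORE_COMPOUND": 400, "SUMMARIZING_CONCEPT": 500, "LESS_COMPOUND": 100})
n = sum(counts.values())

# Inverse relative frequency: rarer classes receive larger raw weights.
raw = {label: 1 / (count / n) for label, count in counts.items()}   # 2.5, 2.0, 10.0

# Normalize so the weights sum to 1 before writing them into the config.
total = sum(raw.values())                                           # 14.5
weights = {label: w / total for label, w in raw.items()}
print(weights)  # MORE_COMPOUND≈0.172, SUMMARIZING_CONCEPT≈0.138, LESS_COMPOUND≈0.690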
55 changes: 44 additions & 11 deletions classifier/model/som.py
@@ -11,16 +11,16 @@ def generate_dummy_sequences(batch_size=64, seq_len=5, embedding_dim=128, num_cl


class Som(nn.Module):
def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers=1, bidirectional=True, dropout_rate=0.0):
def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers=3, bidirectional=True, dropout_rate=0.2):
super(Som, self).__init__()
self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=hidden_dim, kernel_size=3, padding=1)
self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=hidden_dim, kernel_size=5, padding=1)
self.relu = nn.ReLU()
self.bidirectional = bidirectional
# Adjusting the hidden dimension if using a bidirectional GRU, as the outputs will be concatenated
final_hidden_dim = hidden_dim * 2 if bidirectional else hidden_dim

self.gru = nn.GRU(
input_size=hidden_dim,
self.gru = nn.LSTM(
input_size=embedding_dim,
hidden_size=hidden_dim,
num_layers=num_layers,
batch_first=True,
@@ -33,10 +33,10 @@ def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers=1, bidirect
self.fc = nn.Linear(final_hidden_dim, output_dim)

def forward(self, x):
x = x.transpose(1, 2) # Conv1D expects (batch, channels, seq_len)
x = self.conv1d(x)
x = self.relu(x)
x = x.transpose(1, 2) # Back to (batch, seq_len, channels) for GRU
#x = x.transpose(1, 2) # Conv1D expects (batch, channels, seq_len)
#x = self.conv1d(x)
#x = self.relu(x)
#x = x.transpose(1, 2) # Back to (batch, seq_len, channels) for GRU
out, _ = self.gru(x)

# If using a bidirectional GRU, out will contain concatenated hidden states from both directions
@@ -96,9 +96,40 @@ def generate_sequences(data, sequence_length=5):
output_size = 10 # Number of classes (based on the sum's decade)
num_layers = 1 # Number of GRU layers

model = Som(input_size, hidden_size, output_size, num_layers)
# Parameters for the Transformer model
input_size = 28 * 28 # Flattened MNIST images
d_model = 512 # Size of the embedding
output_size = 10 # Number of classes (based on the sum's decade)
num_layers = 2 # Number of Transformer encoder layers
nhead = 8 # Number of heads in the multiheadattention models
dim_feedforward = 2048 # Size of the feedforward model in nn.TransformerEncoder
dropout = 0.1 # Dropout rate

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model and move it to the device
model = TransformerModel(input_size, d_model, output_size, nhead, num_layers, dim_feedforward, dropout).to(device)

# Remaining code for data loading and transformation
transform = transforms.Compose([transforms.ToTensor(), transforms.Lambda(lambda x: torch.flatten(x))])
mnist_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_data, test_data = train_test_split(mnist_data, test_size=0.2, random_state=42)

# Model
# Generate sequences for training and testing
# Ensure that you move your tensors to the device where necessary
train_sequences, train_labels = generate_sequences(train_data,
5) # Modify this function to move tensors to the device
test_sequences, test_labels = generate_sequences(test_data, 5) # Modify this function to move tensors to the device

train_sequences, train_labels = train_sequences.to(device), train_labels.to(device)
test_sequences, test_labels = test_sequences.to(device), test_labels.to(device)

# DataLoader
train_dataset = TensorDataset(train_sequences, train_labels)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

@@ -107,10 +138,12 @@ def generate_sequences(data, sequence_length=5):
model.train()
total_loss = 0
for sequences, labels in train_loader:
sequences, labels = sequences.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(sequences)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
total_loss += loss.item()
print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
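
With the Conv1d front-end commented out, the changed Som class is effectively a bidirectional multi-layer LSTM reading the embeddings directly, followed by the linear head. A minimal shape-check sketch under that reading, using the embedding_dim, hidden_dim, and n_classes values from models.yml; the assumption that the last timestep of the LSTM output feeds self.fc is mine, since the tail of forward() is not shown in this hunk:

import torch
import torch.nn as nn

# Assumed dimensions, taken from models.yml (embedding_dim: 4096, hidden_dim: 1024, n_classes: 9).
embedding_dim, hidden_dim, output_dim = 4096, 1024, 9

lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim,
               num_layers=3, batch_first=True, bidirectional=True, dropout=0.2)
fc = nn.Linear(hidden_dim * 2, output_dim)  # *2: forward and backward states are concatenated

x = torch.randn(2, 5, embedding_dim)   # (batch, seq_len, embedding_dim)
out, _ = lstm(x)                        # (batch, seq_len, hidden_dim * 2)
logits = fc(out[:, -1, :])              # (batch, output_dim); last timestep assumed as the summary
print(logits.shape)                     # torch.Size([2, 9])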

11 changes: 8 additions & 3 deletions classifier/models/models.yml
@@ -27,7 +27,9 @@ models:
som:
<<: *som
n_samples: 5
n_classes: 12
n_classes: 9
loss_weights: [ 0, 0.15172247835473368, 0.1476857349332255, 0.14668625042352593, 0.07449548855387905, 0.14560698498534902, 0.1154368336454986, 0.11241477773875054, 0.10595145136503747 ]

# 0 - stay at higher concept
# 1 - stay at 1
# 2 - stay at 2
@@ -38,15 +40,18 @@ models:
# 7 - deeper at 2
# 8 - deeper at 3

batch_size: 64
batches_per_epoch: 170
batch_size: 256
batches_per_epoch: 200

embedding_dim: 4096
hidden_dim: 1024

weight_decay: 0.000000001


from_module: classifier.data.triangle_pos


f1: micro

classes:
25 changes: 20 additions & 5 deletions classifier/train/ntuple.py
@@ -1,3 +1,4 @@
import gc
import logging
import os

@@ -49,11 +50,22 @@ def colorized_comparison(prefix, predicted_labels, gold_labels):
)

def train(config_name):
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Empty the CUDA cache
#torch.cuda.empty_cache()

# Call the garbage collector to remove the objects from memory
gc.collect()

# Verify the memory status (optional)
#print(torch.cuda.memory_summary()) # Provides a summary of CUDA memory usage

config = get_model_config(config_name)
model = get_model(config)
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
model = get_model(config)#.to(device)

optimizer = torch.optim.Adagrad(model.parameters(), lr=0.003, lr_decay=0.01)

criterion = nn.CrossEntropyLoss(weight=None if not config.get("loss_weights") else torch.tensor(list(config.loss_weights)))
data_gen = DataGenerator(config)


@@ -63,7 +75,7 @@ def train(config_name):
scheduler = CyclicLR(
optimizer,
mode="exp_range",
gamma=0.99,
gamma=0.999,
base_lr=0,
max_lr=0.006,
step_size_up=config.batches_per_epoch * 0.7,
@@ -129,7 +141,10 @@ def train(config_name):


# Backward pass and optimizer step

loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) # Gradient clipping

optimizer.step()

# Calculate F-score for training data using reshaped tensors
@@ -157,7 +172,7 @@ def train(config_name):
print(
f"Epoch {epoch + 1}, {batch=}, {loss=}, {train_fscore=:.2f} {optimizer.param_groups[0]['lr']:.2E}"
)
if train_fscore > max_fscore and train_fscore > 0.5:
if train_fscore > max_fscore and train_fscore > 0.7:
break

avg_loss = total_loss / config.batches_per_epoch
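
A standalone sketch of how the pieces added in this file fit together: a class-weighted CrossEntropyLoss built from the config's loss_weights, Adagrad with the lower initial lr plus lr_decay, the exp_range cyclic schedule with the slower gamma, and gradient clipping before the optimizer step. The tiny linear model, the random batch, and cycle_momentum=False (needed because Adagrad exposes no momentum term to cycle) are illustrative assumptions, not the project's actual wiring:

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import CyclicLR

# Loss weights abbreviated from the models.yml entry added in this commit.
loss_weights = [0, 0.1517, 0.1477, 0.1467, 0.0745, 0.1456, 0.1154, 0.1124, 0.1060]
criterion = nn.CrossEntropyLoss(weight=torch.tensor(loss_weights))

model = nn.Linear(16, len(loss_weights))          # stand-in for the real model
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.003, lr_decay=0.01)
scheduler = CyclicLR(optimizer, mode="exp_range", gamma=0.999,
                     base_lr=0, max_lr=0.006, step_size_up=int(200 * 0.7),
                     cycle_momentum=False)        # Adagrad has no momentum to cycle

inputs = torch.randn(8, 16)
labels = torch.randint(0, len(loss_weights), (8,))
loss = criterion(model(inputs), labels)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)   # clip gradients as in the new code
optimizer.step()
scheduler.step()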
3 changes: 3 additions & 0 deletions lib/config.py
@@ -0,0 +1,3 @@
import os

system_path = os.environ.get("SYSTEM", "../dialectics")
31 changes: 31 additions & 0 deletions lib/git_tools.py
@@ -0,0 +1,31 @@
import subprocess # For running shell commands, 🚀🐚🚀
# With Python's touch, so grand! 🌟🐍🌟

# A function so neat, a treat to repeat, 🍬🎶🍬
def check_git_config(): # Let's take a seat! 🪑🌟🪑
try:
# For email, we'll peek, with Python technique! 📧🔍📧
email = subprocess.check_output(
["git", "config", "--global", "user.email"],
text=True).strip()
# For name, the same, in this Git game! 🎮🔍🎮
name = subprocess.check_output(
["git", "config", "--global", "user.name"],
text=True).strip()

# If found around, let joy resound! 🎉✨🎉
if email and name:
print(f"Email found: {email}, 📧🌈📧\nName's around: {name}! 🌟👤🌟")
return True
else:
print("Some configs are missing, 🚫🤔🚫\nLet's keep on fishing! 🎣🌊🎣")
return False
except subprocess.CalledProcessError:
# If error's in sight, we'll set it right! 🚨🛠️🚨
print("Git configs not found, 🚫🔍🚫\nIn silence they're bound. 🤫🌌🤫")
return None


if __name__ == "__main__":
# Now let's invoke, with a stroke of hope! 🌈🙏🌈
check_git_config()
35 changes: 17 additions & 18 deletions lib/json.py
@@ -1,28 +1,27 @@
import json
from enum import Enum

class EnumCodec(json.JSONEncoder):
def __init__(self, enum_type, *args, **kwargs):
self.enum_type = enum_type
super().__init__(*args, **kwargs)

def default(self, obj):
if isinstance(obj, Enum):
return {"__enum__": f"{obj.__class__.__name__}.{obj.name}"}
return super().default(obj)
else:
return super().default(obj)

@classmethod
def decode(cls, enum_type):
def decode_enum(dct):
if "__enum__" in dct:
enum_name, member_name = dct["__enum__"].split('.')
if enum_name == enum_type.__name__:
return enum_type[member_name]
return dct
return decode_enum
@staticmethod
def decode_enum(dct, enum_type=None):
if "__enum__" in dct:
enum_name, member_name = dct["__enum__"].split('.')
# Assuming enum_type is provided and matches enum_name
if enum_type and enum_type.__name__ == enum_name:
return enum_type[member_name]
return dct

def encode(enum_instance, enum_type):
return json.dumps(enum_instance, cls=EnumCodec, enum_type=enum_type)
def encode(data, enum_type=None):
# Convert enum keys to strings
if isinstance(data, dict):
data = {k.name if isinstance(k, Enum) else k: v for k, v in data.items()}
return json.dumps(data, cls=EnumCodec)

def decode(json_str, enum_type):
return json.loads(json_str, object_hook=EnumCodec.decode(enum_type))
object_hook = lambda dct: EnumCodec.decode_enum(dct, enum_type=enum_type)
return json.loads(json_str, object_hook=object_hook)
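
The reworked codec now resolves tagged dictionaries back into enum members through the static decode_enum hook, bound to a concrete enum type by the object_hook lambda in decode. A minimal sketch of the decode path with a hypothetical enum standing in for ConceptPosition (mirroring how weight_pos.py calls decode(line, ConceptPosition)):

from enum import Enum

from lib.json import decode

# Hypothetical enum used only for this illustration.
class Color(Enum):
    RED = 1
    BLUE = 2

line = '{"label": {"__enum__": "Color.RED"}, "score": 3}'
print(decode(line, Color))   # {'label': <Color.RED: 1>, 'score': 3}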
