Skip to content

Commit

Permalink
Merge branch 'master' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
fmpfeifer committed Feb 14, 2021
2 parents b092864 + 8e15a4e commit 8d509d2
Show file tree
Hide file tree
Showing 5 changed files with 317 additions and 0 deletions.
138 changes: 138 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
66 changes: 66 additions & 0 deletions convert_wav.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import os
import concurrent.futures
import subprocess
import sys


# Upper bound on the number of threads converting files at the same time.
WORKERS = 32

# Write-only handle on the operating system's null device; it is opened once
# at import time and never explicitly closed.
devnull = open(os.devnull, 'w')


def is_wav_file(file_to_test: str) -> bool:
    """Report whether ffprobe identifies ``file_to_test`` as ``pcm_s16le``.

    ``ffprobe.exe`` is run on the file and everything it writes to standard
    error is searched for the ``pcm_s16le`` codec name.

    Args:
        file_to_test: Path of the file to inspect.

    Returns:
        True when the codec name appears in ffprobe's stderr, else False.
    """
    probe = subprocess.run(['ffprobe.exe', file_to_test], capture_output=True)
    return b'pcm_s16le' in probe.stderr


def convert_wav_to_mp3(wav_file: str) -> bool:
    """Re-encode a PCM wav file with the MP3 codec, keeping its file name.

    The original is first renamed to ``<wav_file>_oldwav`` and ffmpeg writes
    its output under the original name. The renamed original is deleted only
    after ffmpeg reports success; on any failure the partial output is
    removed and the original is moved back, so no recording is ever lost.

    Args:
        wav_file: Path of the file to convert.

    Returns:
        True when the file was converted; False when it is not a PCM wav
        file or when ffmpeg exited with a non-zero status.

    Raises:
        OSError: If renaming fails or ffmpeg cannot be launched (the original
            file is restored before the error propagates).
    """
    if not is_wav_file(wav_file):
        print(f"Arquivo {wav_file} não é um arquivo wav")
        return False

    print(f"Convertendo {wav_file} ...")
    renamed_wav = wav_file + '_oldwav'
    os.rename(wav_file, renamed_wav)

    converted = False
    try:
        returncode = subprocess.call(
            ['ffmpeg.exe', '-i', renamed_wav, '-codec:a', 'libmp3lame',
             '-qscale:a', '6', wav_file],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        converted = returncode == 0
    finally:
        if not converted:
            # Roll back: drop whatever ffmpeg managed to write and put the
            # untouched original back under its own name.
            if os.path.exists(wav_file):
                os.remove(wav_file)
            os.rename(renamed_wav, wav_file)

    if not converted:
        print(f"Falha ao converter {wav_file}")
        return False

    os.unlink(renamed_wav)
    print(f"Convertido {wav_file}")
    return True


def collect_files_to_convert(source_folder):
    """List every ``*_Converted.wav`` file found below ``source_folder``.

    Args:
        source_folder: Root directory that is walked recursively.

    Returns:
        A list of paths (each folder yielded by ``os.walk`` joined with the
        file name) in the order the walk produces them.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(source_folder)
        for name in names
        if name.endswith('_Converted.wav')
    ]


def process_dir(dir):
    """Convert every ``*_Converted.wav`` file below ``dir`` using a thread pool.

    Exits the process with status 0 when there is nothing to convert;
    otherwise runs the conversions on up to ``WORKERS`` threads and prints
    how many files were actually converted.

    Args:
        dir: Root directory to scan for files.

    Raises:
        SystemExit: With code 0 when no matching file is found.
    """
    files_to_convert = collect_files_to_convert(dir)
    # Test the collected list, not the directory path string: the path is
    # never empty here, so checking it would never report "nothing to do".
    if not files_to_convert:
        print("Não foram enconrados arquivos para converter")
        sys.exit(0)
    print(f"Convertendo {len(files_to_convert)} arquivos...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as pool:
        # Each conversion returns a bool; summing them counts the successes.
        count = sum(pool.map(convert_wav_to_mp3, files_to_convert))
    print(f"Convertidos {count} arquivos de {len(files_to_convert)}")


if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the folder
    # whose wav files should be converted.
    # sys.exit() is used instead of the bare exit() helper, which is injected
    # by the site module for interactive use and is not guaranteed to exist.
    if len(sys.argv) != 2:
        print(f"É necessário um argumento (pasta do relatório)."
              f" Foram fornecidos {len(sys.argv)-1}.")
        sys.exit(1)
    report_dir = sys.argv[1]
    if not os.path.isdir(report_dir):
        print(f"{report_dir} não é um diretório")
        sys.exit(1)
    process_dir(report_dir)
113 changes: 113 additions & 0 deletions deduper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import os
import hashlib
import shutil
import sys


# File-name endings (compared in lower case) for which a hard link is never
# attempted; such files are always copied.
# NOTE(review): "-shm" looks like SQLite shared-memory sidecar files
# (e.g. "db.sqlite-shm") - confirm.
EXTENSIONS_NOT_TO_HARDLINK = ["-shm"]


def hashfile(filename: str) -> str:
    """Return the hexadecimal MD5 digest of the file at ``filename``.

    The file is opened in binary mode and consumed in 8192-byte blocks so
    that its whole content never has to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(filename, 'rb') as stream:
        # iter() with a sentinel keeps reading until read() returns b''.
        for block in iter(lambda: stream.read(8192), b''):
            digest.update(block)
    return digest.hexdigest()


def create_hard_link(original, link):
    """Create ``link`` as a hard link to the existing file ``original``.

    ``os.link`` is used directly, so the process working directory is left
    untouched and file names never pass through a shell command line.

    If the link cannot be created and nothing exists at ``link`` yet, the
    file is copied instead so the destination is never left incomplete. If
    something already exists at ``link`` it is left as it is.

    Args:
        original: Path of the existing file to link to.
        link: Path of the hard link to create.
    """
    try:
        os.link(original, link)
    except OSError as error:
        if os.path.lexists(link):
            # Never clobber something that is already in place.
            print(f"Could not create hardlink {link}: {error}")
            return
        print(f"Hardlink failed ({error}); copying {original} to {link} ...")
        shutil.copy2(original, link)


def copy_file(source, destdir, destfile, fsize, hash=None):
    """Copy ``source`` into ``destdir`` unless an up-to-date copy exists.

    An existing ``destfile`` is kept when its size equals ``fsize`` and,
    if ``hash`` is given, its MD5 digest equals ``hash``. Any other existing
    ``destfile`` is deleted before copying, so a stale destination that is a
    hard link is replaced rather than overwritten in place (which would also
    change every other file sharing that link).

    Args:
        source: Path of the file to copy.
        destdir: Directory the file is copied into.
        destfile: Full path the copy will have inside ``destdir``.
        fsize: Size of ``source`` in bytes.
        hash: Optional hex digest of ``source`` as produced by ``hashfile``;
            when omitted, a size match alone counts as up to date.
    """
    if os.path.isfile(destfile):
        if os.path.getsize(destfile) == fsize:
            if hash is None:
                print(f"Skipping, {destfile} exists..")
                return
            if hashfile(destfile) == hash:
                print(
                    f"Skipping, {destfile} exist and has same hash as original...")
                return
            print(f"Deleting {destfile}. Hash mismatch...")
        else:
            print(f"Deleting {destfile}. Size mismatch...")
        # Unlink first: writing over a hard-linked file would modify the
        # shared content seen through all of its other links.
        os.remove(destfile)
    print(f"Copying {source} to {destdir} ...")
    shutil.copy(source, destdir)
    shutil.copystat(source, destfile)


def shoud_try_hardlink(filename, fsize) -> bool:
    """Decide whether ``filename`` is a candidate for hard-linking.

    Empty files are never hard-linked, and neither are files whose
    lower-cased name ends with one of ``EXTENSIONS_NOT_TO_HARDLINK``.

    Args:
        filename: Name or path of the file being considered.
        fsize: Size of the file in bytes.

    Returns:
        True when a hard link may be attempted, False otherwise.
    """
    return fsize != 0 and not filename.lower().endswith(
        tuple(EXTENSIONS_NOT_TO_HARDLINK))


def copy_tree(source_folder, dest_root):
    """Copy a directory tree, hard-linking files with identical content.

    Every file below ``source_folder`` is hashed. The first file seen with a
    given (digest, size) pair is copied; later files with the same pair are
    created as hard links to that copy. After more than 1000 links to one
    copy, the next duplicate is copied again and becomes the new link target.
    Files rejected by ``shoud_try_hardlink`` are always copied.

    Args:
        source_folder: Root of the tree to copy.
        dest_root: Existing directory that receives the copy.
    """
    # Resolve both roots up front so that no path used below depends on the
    # process working directory staying the same while the copy is running.
    source_folder = os.path.abspath(source_folder)
    dest_root = os.path.abspath(dest_root)

    # (digest, size) -> [destination path used as link target, link count]
    hash_dict = {}
    saved = 0
    for current_folder, _, file_list in os.walk(source_folder):
        rel_folder = os.path.relpath(current_folder, source_folder)
        dest_folder = os.path.normpath(os.path.join(dest_root, rel_folder))
        os.makedirs(dest_folder, exist_ok=True)
        for fname in file_list:
            source_filename = os.path.join(current_folder, fname)
            dest_file = os.path.join(dest_folder, fname)
            fsize = os.path.getsize(source_filename)
            digest = hashfile(source_filename)

            if not shoud_try_hardlink(source_filename, fsize):
                copy_file(source_filename, dest_folder, dest_file, fsize,
                          digest)
                continue

            key = (digest, fsize)
            if key not in hash_dict:
                # First occurrence of this content: copy it and remember it.
                hash_dict[key] = [dest_file, 1]
                copy_file(source_filename, dest_folder, dest_file, fsize,
                          digest)
                continue

            link_source, link_count = hash_dict[key]
            if link_count > 1000:
                # Too many links to one file: start over with a fresh copy.
                copy_file(source_filename, dest_folder, dest_file, fsize)
                hash_dict[key] = [dest_file, 1]
                continue

            if os.path.isfile(dest_file):
                print(f"Do not create hardlink, {dest_file} exists")
            else:
                print(f"Creating hardlink for "
                      f"{dest_file} with {link_source} ...")
                create_hard_link(link_source, dest_file)
            hash_dict[key][1] += 1
            saved += fsize
    print(f"Saved {saved} bytes with hardlinks")


if __name__ == "__main__":
    # Command-line entry point: expects the source folder and the destination
    # folder of the copy, in that order.
    # sys.exit() is used instead of the bare exit() helper, which is injected
    # by the site module for interactive use and is not guaranteed to exist.
    if len(sys.argv) != 3:
        print(f"São necessários 2 argumentos,"
              f" mas foram informados {len(sys.argv)-1}")
        print("Informe a pasta de origem e a de destino da cópia.")
        sys.exit(1)
    source = sys.argv[1]
    dest = sys.argv[2]
    # A bare drive specification such as "D:" gets a trailing backslash
    # appended before it is checked and used as the destination.
    if dest.endswith(":"):
        dest += "\\"
    if not os.path.isdir(source):
        print(f"{source} não é um diretório")
        sys.exit(1)
    if not os.path.isdir(dest):
        print(f"{dest} não é um diretório")
        sys.exit(1)
    copy_tree(source, dest)
    sys.exit(0)
Binary file added ffmpeg.exe
Binary file not shown.
Binary file added ffprobe.exe
Binary file not shown.

0 comments on commit 8d509d2

Please sign in to comment.