diff --git a/exfiltrate.py b/exfiltrate.py index a1ca3b1..52a8157 100644 --- a/exfiltrate.py +++ b/exfiltrate.py @@ -30,6 +30,7 @@ if __name__ == '__main__': parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)') parser.add_argument('--no-download', action='store_true', help="Don't do any downloading or image checking.") parser.add_argument('--download-loops', default=1, type=int, help='Sometimes the tiles are downloaded incorrectly. Re-running the download process can fix these corrupted tiles. This arg specifies how many times to run the download process. Default: 1') + parser.add_argument('--convert', action='store_true', help="Convert tiles to PNG.") args = parser.parse_args() if args.download_loops <= 0: @@ -79,7 +80,7 @@ if __name__ == '__main__': else: col_bar = tqdm(total=len(col_iter), leave=False) with (ThreadPoolExecutor(args.dl_threads) as executor): - futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for col in col_iter] + futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy, args.convert)) for col in col_iter] for future in as_completed(futures): result = future.result() if result: diff --git a/pkg/image.py b/pkg/image.py index 4fac620..2c4bdba 100644 --- a/pkg/image.py +++ b/pkg/image.py @@ -20,7 +20,8 @@ def is_png(file_path): """ try: img = Image.open(file_path) - return img.format == 'PNG' + img.verify() + return img.format == 'PNG', img.format except PIL.UnidentifiedImageError as e: # tqdm.write(str(e)) - return False + return False, None diff --git a/pkg/thread.py b/pkg/thread.py index 76468bf..016c342 100644 --- a/pkg/thread.py +++ b/pkg/thread.py @@ -2,6 +2,7 @@ import shutil from pathlib import Path import requests +from PIL 
import Image from tqdm import tqdm from pkg.proxies import PROXIES @@ -16,25 +17,35 @@ def del_path(p: Path): def download_tile(task): - row, col, base_url, r_headers, output, use_proxy = task + row, col, base_url, r_headers, output, use_proxy, convert_to_png = task corrupted_image = False try: output_path: Path = output / f"{row}_{col}.png" if output_path.exists(): - if not is_png(output_path): + valid_png_file, image_type = is_png(output_path) + if not valid_png_file: # We will re-download the image. Don't need to delete it, just overwrite it. # del_path(output_path) corrupted_image = True - tqdm.write(f'Cannot identify image file: "{output_path}", deleting and retrying...') + tqdm.write(f'Cannot identify image file: "{output_path}" (is {image_type}), deleting and retrying...') else: return row, col, 'exist' tile_url = f"{base_url}/{row}/{col}".replace('//', '/').replace(':/', '://') response = requests.get(tile_url, headers=r_headers, proxies=PROXIES if use_proxy else None, timeout=60) if response.status_code == 200: - if not response.headers.get('Content-Type') == 'image/png': + if not convert_to_png and not response.headers.get('Content-Type') == 'image/png': + # If we will convert the image to a PNG, ignore this header. raise Exception(f'Response gave Content-Type: {response.headers.get("Content-Type")}') with open(output_path, "wb") as f: f.write(response.content) + + if convert_to_png: + img = Image.open(output_path) + if img.format != 'PNG': + img.save(output_path, format='PNG') + tqdm.write(f'Converted {output_path} from {img.format} to PNG') + # Recheck the PNG if it was corrupted. - if corrupted_image and not is_png(output_path): + # is_png() now returns a (valid, format) tuple, which is always truthy; test the flag itself. + if corrupted_image and not is_png(output_path)[0]: print(f"Retry for {row}_{col} failed a second time: cannot identify image file")