From 55e2ae9df950f4cd6f07de8902818ff0f0c57eba Mon Sep 17 00:00:00 2001 From: Cyberes Date: Fri, 3 Nov 2023 16:31:53 -0600 Subject: [PATCH] add retry --- README.md | 21 ++++++++++++++++++++- exfiltrate.py | 39 +++++++++++++++++++++++++++++++------- pkg/thread.py | 52 ++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 89 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index aec3d45..8625a8d 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,28 @@ You think your firewalls and security mumbo-jumbo can keep me at bay? THINK AGAI So, buckle up, WMTS servers. Your reign of TILE TERROR is about to CRASH AND BURN. I'm coming for your DATA, and I'm bringing a whole lot of CHAOS with me. +### Install + +```shell +pip install -r requirements.txt +``` + +### Use + +Example: + +```shell +python3 exfiltrate.py \ + https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/ \ + --zoom 20 \ + --referer https://maps.nlsc.gov.tw/ \ + --bbox 25.076387 121.68951 25.068282 121.700175 \ + --threads 30 +``` + ### ArcGIS -- Set `Stretch type` to `Esri`, which is the only stretch type that works with the background mask. +??? ### Credits diff --git a/exfiltrate.py b/exfiltrate.py index fc030e8..42c89b6 100644 --- a/exfiltrate.py +++ b/exfiltrate.py @@ -49,22 +49,47 @@ if __name__ == '__main__': r_headers['Referer'] = args.referer tiles = [] + retries = [] total_downloaded = 0 row_i = min_row - for row in tqdm(range(min_row, max_row + 1), desc=f'Row {row_i}'): + row_iter = range(min_row, max_row + 1) + row_bar = tqdm(total=len(row_iter), desc=f'Row {row_i}', postfix={'failures': len(retries)}) + for row in row_iter: row_i = row - bar = tqdm(total=len(range(min_col, max_col + 1)), leave=False) + col_bar = tqdm(total=len(range(min_col, max_col + 1)), leave=False) with ThreadPoolExecutor(args.threads) as executor: futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output)) for col in range(min_col, max_col + 1)] for future in as_completed(futures): result = future.result() if result: result_row, result_col, new_image = result - tiles.append((result_row, result_col)) - if new_image: + if new_image == 'success': total_downloaded += 1 - bar.update() - bar.close() + tiles.append((result_row, result_col)) + elif new_image == 'exist': + tiles.append((result_row, result_col)) + elif new_image == 'failure': + retries.append((result_row, result_col)) + row_bar.set_postfix({'failures': len(retries)}) + col_bar.update() + col_bar.close() + row_bar.update() + row_bar.close() + + col_bar = tqdm(total=len(retries), desc=f'Tile Retries') + with ThreadPoolExecutor(args.threads) as executor: + futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output)) for row, col in retries] + for future in as_completed(futures): + result = future.result() + if result: + result_row, result_col, new_image = result + tiles.append((result_row, result_col)) + if new_image == 'success': + total_downloaded += 1 + elif new_image == 'failure': + col_bar.write(f'{(result_row, result_col)} failed!') + col_bar.update() + col_bar.close() print(f'Downloaded {total_downloaded} images.') @@ -97,7 +122,7 @@ if __name__ == '__main__': mask = np.any(tile_data == 0, axis=-1) & np.any(tile_data != 0, axis=-1) # Identify pixels where not all bands are zero and at least one band is zero. for i in range(3): # Iterate over each band. # For these pixels, set zero bands to one. - tile_data[mask & (tile_data[:, :, i] == 0), i] = 1 + tile_data[mask & (tile_data[:, :, i] == 0), i] = 0.1 # Calculate the position of the tile in the image data array. row_pos = (row - min_row) * tile_size diff --git a/pkg/thread.py b/pkg/thread.py index 31e9010..fac43fb 100644 --- a/pkg/thread.py +++ b/pkg/thread.py @@ -1,23 +1,45 @@ +import shutil +import time +from pathlib import Path + import requests +from tqdm import tqdm from pkg.proxies import PROXIES from .image import is_png +def del_path(p: Path): + if p.is_file() or p.is_symlink(): + p.unlink(missing_ok=True) + else: + shutil.rmtree(p) + + def download_tile(task): row, col, base_url, r_headers, output = task - output_path = output / f"{row}_{col}.png" - if output_path.exists(): - assert is_png(output_path) - return row, col, False - tile_url = f"{base_url}/{row}/{col}" - response = requests.get(tile_url, headers=r_headers, proxies=PROXIES) - if response.status_code == 200: - if not response.headers.get('Content-Type') == 'image/png': - raise Exception(f'Response gave Content-Type: {response.headers.get("Content-Type")}') - with open(output_path, "wb") as f: - f.write(response.content) - assert is_png(output_path) - return row, col, True - else: - print(f"Failed to download tile {row}_{col}") + try: + output_path: Path = output / f"{row}_{col}.png" + if output_path.exists(): + if not is_png(output_path): + # Delete the file and try again later. + del_path(output_path) + return row, col, 'failure' + return row, col, 'exist' + tile_url = f"{base_url}/{row}/{col}" + response = requests.get(tile_url, headers=r_headers, proxies=PROXIES, timeout=60) + if response.status_code == 200: + if not response.headers.get('Content-Type') == 'image/png': + raise Exception(f'Response gave Content-Type: {response.headers.get("Content-Type")}') + with open(output_path, "wb") as f: + f.write(response.content) + if not is_png(output_path): + del_path(output_path) + return row, col, 'failure' + return row, col, 'success' + else: + print(f"Failed to download tile {row}_{col}") + except Exception as e: + # traceback.print_exc() + tqdm.write(f'Exception on {(row, col)} - {e.__class__.__name__}: {e}') + return row, col, 'failure'