update regex

This commit is contained in:
Junyi Hou
2024-08-11 20:25:41 +08:00
parent 2d1176b969
commit 94ab2b6535
3 changed files with 42 additions and 82 deletions
+6
View File
@@ -0,0 +1,6 @@
{
"black-formatter.args": [
"--line-length",
"200"
]
}
+35 -79
View File
@@ -5,7 +5,6 @@ import pickle
import re import re
import time import time
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from sqlite3 import Connection, Cursor
from selenium import webdriver from selenium import webdriver
from selenium.common.exceptions import UnableToSetCookieException from selenium.common.exceptions import UnableToSetCookieException
@@ -14,16 +13,7 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm from tqdm import tqdm
from utils import ( from utils import check_key, db_close, db_delete, db_get_all_keys, db_insert, db_key_exists, db_open, db_remove_duplication
check_key,
db_close,
db_delete,
db_get_all_keys,
db_insert,
db_key_exists,
db_open,
db_remove_duplication,
)
FORMAT = "%(message)s" FORMAT = "%(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT, datefmt="[%X]") logging.basicConfig(level=logging.INFO, format=FORMAT, datefmt="[%X]")
@@ -33,13 +23,15 @@ log = logging.getLogger("ChatGPT-API-Leakage")
class APIKeyLeakageScanner: class APIKeyLeakageScanner:
def __init__(self, db_file: str, keywords: list, languages: list): def __init__(self, db_file: str, keywords: list, languages: list):
self.db_file = db_file self.db_file = db_file
log.info(f"📂 Opening database file {self.db_file}") log.info(f"📂 Opening database file {self.db_file}")
self.con, self.cur = db_open(self.db_file) self.con, self.cur = db_open(self.db_file)
self.keywords = keywords self.keywords = keywords
self.languages = languages self.languages = languages
self.candidate_urls = [ self.candidate_urls = [
f"https://github.com/search?q={keyword}+AND+%28%2Fsk-%5Ba-zA-Z0-9%5D%7B48%7D%2F%29+language%3A{language}&type=code&ref=advsearch" # f"https://github.com/search?q={keyword}+AND+%28%2Fsk-%5Ba-zA-Z0-9%5D%7B48%7D%2F%29+language%3A{language}&type=code&ref=advsearch"
f"https://github.com/search?q={keyword}+AND+%28%2Fsk-proj-%5BA-Za-z0-9%5D%7B20%7DT3BlbkFJ%5BA-Za-z0-9%5D%7B20%7D%2F%29+language%3A{language}&type=code&ref=advsearch"
for language in self.languages for language in self.languages
for keyword in self.keywords for keyword in self.keywords
] ]
@@ -59,18 +51,9 @@ class APIKeyLeakageScanner:
self.driver.add_cookie(cookie) self.driver.add_cookie(cookie)
except UnableToSetCookieException as e: except UnableToSetCookieException as e:
log.debug(f"🟡 Warning, unable to set a cookie {cookie}") log.debug(f"🟡 Warning, unable to set a cookie {cookie}")
except EOFError as e: except (EOFError, pickle.UnpicklingError):
if os.path.exists("cookies.pkl"): os.remove("cookies.pkl") if os.path.exists("cookies.pkl") else None
os.remove("cookies.pkl") log.error("🔴 Error, unable to load cookies, invalid cookies has been removed, please restart.")
log.error(
"🔴 Error, unable to load cookies, invalid cookies has been removed, please restart."
)
except pickle.UnpicklingError as e:
if os.path.exists("cookies.pkl"):
os.remove("cookies.pkl")
log.error(
"🔴 Error, load cookies failed, invalid cookies has been removed, please restart."
)
def _test_cookies(self): def _test_cookies(self):
""" """
@@ -79,26 +62,18 @@ class APIKeyLeakageScanner:
log.info("🤗 Redirecting ...") log.info("🤗 Redirecting ...")
self.driver.get("https://github.com/") self.driver.get("https://github.com/")
if self.driver.find_elements( if self.driver.find_elements(by=By.XPATH, value="//*[contains(text(), 'Sign in')]"):
by=By.XPATH, value="//*[contains(text(), 'Sign in')]"
):
return False return False
return True return True
def _hit_rate_limit(self):
return self.driver.find_elements(
by=By.XPATH,
value="//*[contains(text(), 'You have exceeded a secondary rate limit')]",
)
def login_to_github(self): def login_to_github(self):
log.info("🌍 Opening Chrome ...") log.info("🌍 Opening Chrome ...")
self.options = webdriver.ChromeOptions() options = webdriver.ChromeOptions()
self.options.add_argument("--ignore-certificate-errors") options.add_argument("--ignore-certificate-errors")
self.options.add_argument("--ignore-ssl-errors") options.add_argument("--ignore-ssl-errors")
self.driver = webdriver.Chrome(options=self.options) self.driver = webdriver.Chrome(options=options)
self.driver.implicitly_wait(3) self.driver.implicitly_wait(3)
cookie_exists = os.path.exists("cookies.pkl") cookie_exists = os.path.exists("cookies.pkl")
@@ -113,8 +88,7 @@ class APIKeyLeakageScanner:
self._load_cookies() self._load_cookies()
if not self._test_cookies(): if not self._test_cookies():
if os.path.exists("cookies.pkl"): os.remove("cookies.pkl") if os.path.exists("cookies.pkl") else None
os.remove("cookies.pkl")
log.error("🔴 Error, you are not logged in, please restart and try again.") log.error("🔴 Error, you are not logged in, please restart and try again.")
exit(1) exit(1)
@@ -122,27 +96,20 @@ class APIKeyLeakageScanner:
def _process_url(self, url: str): def _process_url(self, url: str):
self.driver.get(url) self.driver.get(url)
pattern = re.compile(r"sk-[a-zA-Z0-9]{48}") pattern = re.compile(r"sk-proj-[A-Za-z0-9]{20}T3BlbkFJ[A-Za-z0-9]{20}")
while True: while True:
# If current webpage is reached the rate limit, then wait for 30 seconds # If current webpage is reached the rate limit, then wait for 30 seconds
if self._hit_rate_limit(): if self.driver.find_elements(by=By.XPATH, value="//*[contains(text(), 'You have exceeded a secondary rate limit')]"):
for _ in tqdm(range(30), desc="⏳ Rate limit reached, waiting ..."): for _ in tqdm(range(30), desc="⏳ Rate limit reached, waiting ..."):
time.sleep(1) time.sleep(1)
self.driver.refresh() self.driver.refresh()
continue continue
# Expand all the code # Expand all the code
[ [element.click() for element in self.driver.find_elements(by=By.XPATH, value="//*[contains(text(), 'more match')]")]
element.click()
for element in self.driver.find_elements(
by=By.XPATH, value="//*[contains(text(), 'more match')]"
)
]
codes = self.driver.find_elements( codes = self.driver.find_elements(by=By.CLASS_NAME, value="code-list") # find all elements with class name 'f4'
by=By.CLASS_NAME, value="code-list"
) # find all elements with class name 'f4'
for element in codes: for element in codes:
apis = pattern.findall(element.text) apis = pattern.findall(element.text)
if len(apis) == 0: if len(apis) == 0:
@@ -156,20 +123,12 @@ class APIKeyLeakageScanner:
for idx, result in enumerate(results): for idx, result in enumerate(results):
db_insert(self.con, self.cur, apis[idx], result) db_insert(self.con, self.cur, apis[idx], result)
next_buttons = self.driver.find_elements( next_buttons = self.driver.find_elements(by=By.XPATH, value="//a[@aria-label='Next Page']")
by=By.XPATH, value="//a[@aria-label='Next Page']"
)
try: try:
WebDriverWait(self.driver, 5).until( WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.XPATH, "//a[@aria-label='Next Page']")))
EC.presence_of_element_located(
(By.XPATH, "//a[@aria-label='Next Page']")
)
)
next_buttons = self.driver.find_elements( next_buttons = self.driver.find_elements(by=By.XPATH, value="//a[@aria-label='Next Page']")
by=By.XPATH, value="//a[@aria-label='Next Page']"
)
next_buttons[0].click() next_buttons[0].click()
except Exception as _: except Exception as _:
# log.info(" ⚪️ No more pages") # log.info(" ⚪️ No more pages")
@@ -177,27 +136,24 @@ class APIKeyLeakageScanner:
def _save_progress(self, from_iter: int): def _save_progress(self, from_iter: int):
with open(".progress.txt", "w") as file: with open(".progress.txt", "w") as file:
# Save the progress and timestamp
file.write(f"{from_iter}/{len(self.candidate_urls)}/{time.time()}") file.write(f"{from_iter}/{len(self.candidate_urls)}/{time.time()}")
def _load_progress(self): def load_progress(self):
if not os.path.exists(".progress.txt"): progress_file = ".progress.txt"
return 0 if not os.path.exists(progress_file):
with open(".progress.txt", "r") as file:
progress = file.read().strip().split("/")
last = int(progress[0])
totl = int(progress[1])
tmst = float(progress[2])
# if the time is less than 1 hour, then continue from the last progress
if time.time() - tmst < 3600 and totl == len(self.candidate_urls):
# ask the user if they want to continue from the last progress
action = input(f"🔍 Progress found, do you want to continue from the last progress ({last}/{totl})? [yes] | no: ")
if action.lower() == "yes" or action.lower() == "y" or action == "":
return int(progress[0])
else:
return 0
return 0 return 0
with open(progress_file, "r") as file:
last, totl, tmst = file.read().strip().split("/")
last, totl = int(last), int(totl)
if time.time() - float(tmst) < 3600 and totl == len(self.candidate_urls):
action = input(f"🔍 Progress found, do you want to continue from the last progress ({last}/{totl})? [yes] | no: ").lower()
if action in {"yes", "y", ""}:
return last
return 0
def search(self, from_iter: int = None): def search(self, from_iter: int = None):
pbar = tqdm( pbar = tqdm(
enumerate(self.candidate_urls), enumerate(self.candidate_urls),
@@ -206,7 +162,7 @@ class APIKeyLeakageScanner:
) )
if from_iter is None: if from_iter is None:
from_iter = self._load_progress() from_iter = self.load_progress()
for idx, url in enumerate(self.candidate_urls): for idx, url in enumerate(self.candidate_urls):
if idx < from_iter: if idx < from_iter:
+1 -3
View File
@@ -14,9 +14,7 @@ def db_get_all_keys(cur: Cursor) -> list:
def db_remove_duplication(con: Connection, cur: Cursor) -> None: def db_remove_duplication(con: Connection, cur: Cursor) -> None:
cur.execute( cur.execute("CREATE TABLE temp_table as SELECT apiKey, status, MAX(lastChecked) as lastChecked FROM APIKeys GROUP BY apiKey;")
"CREATE TABLE temp_table as SELECT apiKey, status, MAX(lastChecked) as lastChecked FROM APIKeys GROUP BY apiKey;"
)
cur.execute("DROP TABLE APIKeys;") cur.execute("DROP TABLE APIKeys;")
cur.execute("ALTER TABLE temp_table RENAME TO APIKeys;") cur.execute("ALTER TABLE temp_table RENAME TO APIKeys;")
con.commit() con.commit()