# FraudScore / count_authors.py
# Uploaded by TheFrenchDemos - commit 60f6737 (verified), commit message: "désactivation" (deactivation)
# File size: 3.51 kB
# count_authors is reportedly the cause of a FraudScore runtime error that appeared spontaneously on 15 June 2025.
# Hugging Face sent the following message:
# "We have detected the following secret in spaces/TheFrenchDemos/FraudScore at the revision fd81b6050a663bae8e3ce6975c22635eb665c8dd.
# "Flickr (status: active) in count_authors.py
# "We strongly advise you rotate this secret, in particular if your repository is public.
# "Read more about our Secret scanning feature: https://huggingface.co/docs/hub/en/security-secrets
# However, this module is not needed by FraudScore.
# It has been disabled by commenting out line 89 and removing the code on lines 19 to 21, which can be recovered from the reward simulator if needed.
import tqdm
from multiprocessing import Pool, cpu_count
import signal
import sys
import time
from flickrapi import FlickrAPI
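# Flickr configuration (the API credentials and the client construction were
# removed from this file; only the stubs below remain):
#FLICKR_API
#FLICKR_API
#flickr =
# NOTE: unless a `flickr` client is defined elsewhere, the name used by
# get_other_info() is undefined, so every lookup returns the 'Unknown' placeholder.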
def get_photo_id(url):
    """Extract the photo ID from a Flickr image URL.

    The ID is the text before the first underscore in the last path segment,
    e.g. ``.../1234/5678_abcdef_o.jpg`` gives ``"5678"``.

    Args:
        url: URL string to parse.

    Returns:
        The extracted ID as a string (empty when the last path segment is
        empty or starts with an underscore), or ``None`` when *url* is not a
        string.
    """
    try:
        # Only the last '/' and the first '_' matter, so split once each.
        return url.rsplit('/', 1)[-1].split('_', 1)[0]
    except (AttributeError, TypeError):
        # Non-string input (None, numbers, bytes, ...). Catch only what such
        # input raises so KeyboardInterrupt/SystemExit are never swallowed.
        return None
def get_other_info(url):
    """Look up author and license information for a Flickr photo URL.

    The lookup is best-effort: if no photo ID can be extracted, or anything
    goes wrong while querying the module-level ``flickr`` client or reading
    its response, a placeholder record is returned instead of raising.

    NOTE(review): the ``flickr`` client is not defined in the visible part of
    this file (its construction is commented out), in which case every call
    ends up returning the placeholder record.

    Args:
        url: Flickr image URL; the photo ID is extracted with
            ``get_photo_id``.

    Returns:
        dict with the keys ``username``, ``realname``, ``nsid``,
        ``flickr_url`` and ``license``. On failure ``username``, ``realname``
        and ``license`` are ``'Unknown'`` and the other two are empty strings.
    """
    try:
        photo_id = get_photo_id(url)
        if photo_id:
            # Wait 0.1 s before each API call (presumably to throttle
            # requests to Flickr).
            time.sleep(0.1)
            photo_info = flickr.photos.getInfo(photo_id=photo_id)
            photo = photo_info['photo']
            # Named photo_license so the builtin ``license`` is not shadowed.
            photo_license = photo['license']
            owner = photo['owner']
            nsid = owner.get('nsid', '')
            return {
                'username': owner.get('username', ''),
                'realname': owner.get('realname', ''),
                'nsid': nsid,
                'flickr_url': f"https://www.flickr.com/photos/{nsid}/{photo_id}",
                'license': photo_license,
            }
    except Exception:
        # Deliberately best-effort: any ordinary failure falls through to the
        # placeholder below. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
    return {
        'username': 'Unknown',
        'realname': 'Unknown',
        'nsid': '',
        'flickr_url': '',
        'license': 'Unknown',
    }
def init_worker():
    """Pool initializer: make the calling process ignore SIGINT.

    Used as the ``initializer`` of the worker pool, so that an interrupt is
    not handled inside the workers.
    """
    ignore_handler = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignore_handler)
def process_url(url):
    """Return the author record for *url*, never raising an ``Exception``.

    Delegates to ``get_other_info``. If that call raises, an error record is
    returned instead: ``username`` is ``'Error'``, ``realname`` carries the
    exception text and ``flickr_url`` carries the offending URL.
    """
    try:
        record = get_other_info(url)
    except Exception as exc:
        record = dict(
            username='Error',
            realname=str(exc),
            nsid='',
            flickr_url=url,
            license='Unknown',
        )
    return record
def process_urls_in_chunks(urls, chunk_size=100000):
    """Fetch the author record of every URL using a pool of worker processes.

    The URLs are handed to the pool in consecutive slices of *chunk_size*,
    each slice getting its own progress bar. Results are collected in input
    order.

    Args:
        urls: Sliceable sequence of URL strings.
        chunk_size: Number of URLs per slice / progress bar; must be >= 1.

    Returns:
        list of the records returned by ``process_url``, one per URL, in the
        same order as *urls*. Empty input returns ``[]`` without starting any
        worker process.

    Raises:
        ValueError: if *chunk_size* is smaller than 1 (a negative value would
            otherwise silently produce no results).
        SystemExit: with status 1 when processing is interrupted by the user.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")
    # len() rather than truthiness so any sequence type with a length works.
    if len(urls) == 0:
        return []

    authors = []
    with Pool(cpu_count(), initializer=init_worker) as pool:
        try:
            # Process URLs in chunks
            for start in range(0, len(urls), chunk_size):
                chunk = urls[start:start + chunk_size]
                chunk_results = list(tqdm.tqdm(
                    pool.imap(process_url, chunk),
                    total=len(chunk),
                    desc=f"Processing chunk {start//chunk_size + 1}"
                ))
                authors.extend(chunk_results)
        except KeyboardInterrupt:
            # Workers ignore SIGINT (see the pool initializer), so the
            # interrupt surfaces here; stop the workers before exiting.
            pool.terminate()
            pool.join()
            print("\nProcessing interrupted by user")
            sys.exit(1)
    return authors
def main():
    """Count the distinct Flickr usernames behind the first 100000 listed URLs.

    Reads ``data/openimages_urls.txt`` (one URL per line), resolves each URL
    to an author record and prints the number of distinct usernames.
    """
    from itertools import islice

    urls_file = "data/openimages_urls.txt"
    max_urls = 100000
    with open(urls_file, encoding="utf-8") as f:
        # Stop reading after max_urls lines instead of loading the whole
        # file and truncating afterwards.
        urls = [line.strip() for line in islice(f, max_urls)]
    authors = process_urls_in_chunks(urls)
    # Count unique authors
    unique_authors = len({author['username'] for author in authors})
    print(f"unique_authors: {unique_authors}")
    print(f"Number of unique authors: {unique_authors}")


# The guard is required for multiprocessing code: without it the pipeline
# runs on every import of this module, including the re-import performed by
# worker processes under the "spawn" start method.
if __name__ == "__main__":
    main()