This repository has been archived by the owner on Dec 2, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadd_individuals.py
256 lines (209 loc) · 9 KB
/
add_individuals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
from pathlib import Path
import typing
import pandas as pd
from django.core.exceptions import ObjectDoesNotExist
from start_dj import start_django_lite
start_django_lite()
from ontogen import Ontology
from ontogen.converter import OntogenConverter, OwlClass, OwlIndividual
from autogen_db_models import imdb, awards
from engine import Session
import attr
from app import models
from spacy_nlp import get_not_person_name
import spacy_nlp as snlp
_T = typing.TypeVar('_T')
FilePathOrBuffer = typing.Union[str, Path, typing.IO[typing.AnyStr]]
# class PersonRole:
# person: Person = attr.ib()
# film: Film = attr
@attr.s(auto_attribs=True, slots=True)
class Person:
    """Slotted attrs record holding a person's name and ``nconst`` string.

    NOTE(review): ``nconst`` is presumably the IMDb name identifier -- confirm
    against the data that populates it.
    """

    # Both attributes are mandatory, positional-or-keyword constructor args.
    name: str
    nconst: str
@attr.s(auto_attribs=True, slots=True)
class Film:
    """Slotted attrs record describing a film by id, title and year."""

    title_id: str
    title: str
    # The year is passed through int() on construction, so values read from
    # a CSV or database row in another numeric/string form are normalised.
    film_year: int = attr.ib(converter=int)
    # Attributes sketched by the original author but not enabled:
    # content_rating: str = attr.ib(default=None)
    # feature_length: int = attr.ib(default=None)
    # avg_rating: float = attr.ib(default=None)
    # person_role: typing.List[typing.Tuple[Person, str]] = attr.ib(factory=list)
@attr.s(auto_attribs=True, slots=True)
class Nomination:
    """Slotted attrs record tying a nominee and an award agency to a ``Film``."""

    nominee: str
    # True when the nomination resulted in a win.
    win: bool
    award_agency: str
    film: Film
def convert_yaml_to_owl(yaml_file: FilePathOrBuffer, owl_file: Path):
    """Build an ontology from a YAML spec and write it to ``owl_file``.

    The spec is loaded with ``OntogenConverter.load_from_spec``, synced into
    an in-memory ``Ontology`` and that ontology is then saved to disk.
    """
    ontology: Ontology = (
        OntogenConverter.load_from_spec(yaml_file).sync_with_ontology()
    )
    # The original author notes the output may be RDF/XML ('xml') or Turtle
    # ('ttl'). NOTE(review): how the format is chosen is not visible here --
    # confirm against ``Ontology.save_to_file``.
    ontology.save_to_file(owl_file)
def load_attrs_list(file: FilePathOrBuffer,
                    cls: typing.Type[_T]) -> typing.List[_T]:
    """Read a CSV file and build one ``cls`` instance per row.

    Only columns whose name appears in ``cls.__slots__`` are forwarded to the
    constructor as keyword arguments; every other column is ignored. ``cls``
    must therefore define ``__slots__``.
    """
    frame = pd.read_csv(file)
    return [
        cls(**{column: value
               for column, value in record.items()
               if column in cls.__slots__})
        for _, record in frame.iterrows()
    ]
def dump_attrs_list(file: FilePathOrBuffer,
                    attrs_lst: typing.List[Film]) -> None:
    """Write a list of attrs instances to ``file`` as CSV.

    Each instance is converted with ``attr.asdict`` and becomes one row; the
    DataFrame index is not written.
    """
    rows = [attr.asdict(instance) for instance in attrs_lst]
    pd.DataFrame(rows).to_csv(file, index=False)
def get_starting_films_from_awards() -> typing.List[Film]:
    """Return the films from the Oscar and BAFTA tables that IMDb can resolve.

    Distinct ``(title, year)`` pairs are collected from every Oscar and BAFTA
    row, then each pair is looked up in ``TitleAkas`` joined to ``TitleBasics``
    by exact title and start year. The first match (if any) becomes a ``Film``
    with ``title_id``, ``title`` and ``film_year`` filled in; unmatched pairs
    are dropped.

    Returns:
        The matched films, in the (arbitrary) iteration order of the set of
        award pairs.

    Raises:
        AssertionError: if a collected title is not a ``str`` or a year is
            not an ``int``.
    """
    session = Session()
    # The session used to be left open; release it even when a query or one
    # of the sanity checks below fails. Only plain values are copied into
    # the returned Film objects, so nothing outlives the session.
    try:
        oscars = session.query(awards.Oscar).all()
        baftas = session.query(awards.Bafta).all()

        unique_oscar_films = {
            (oscar.film, oscar.year_film)
            for oscar in oscars
            if oscar.film is not None and oscar.year_film is not None
        }

        unique_bafta_films = set()
        for bafta in baftas:
            if bafta.nominee is None or bafta.year is None:
                continue
            if bafta.workers is not None:
                # NOTE(review): presumably returns whichever of the two
                # strings is the film title rather than a person's name. An
                # earlier, disabled revision caught ValueError here and asked
                # the user to choose -- confirm whether that can still raise.
                film_name = get_not_person_name(bafta.nominee, bafta.workers)
            else:
                film_name = bafta.nominee
            unique_bafta_films.add((film_name, bafta.year))

        unique_award_winning_films = unique_oscar_films | unique_bafta_films

        film_lst: typing.List[Film] = []
        # Base query built once; each film only adds its own filters.
        title_akas_basics = session.query(
            imdb.TitleAkas.titleId,
            imdb.TitleAkas.title,
            imdb.TitleBasics.startYear,
        ).join(imdb.TitleBasics,
               imdb.TitleAkas.titleId == imdb.TitleBasics.tconst)

        for film in unique_award_winning_films:
            assert isinstance(film[0], str), str(film)
            assert isinstance(film[1], int), str(film)
            print(f"{film=}")
            title_akas_basic = (
                title_akas_basics
                .filter(imdb.TitleAkas.title == film[0])
                .filter(imdb.TitleBasics.startYear == film[1])
                .first()
            )
            if title_akas_basic is None:
                continue
            film_lst.append(Film(title_id=title_akas_basic[0],
                                 title=title_akas_basic[1],
                                 film_year=title_akas_basic[2]))
            print(title_akas_basic)
        return film_lst
    finally:
        session.close()
def init_awards():
    """Make sure the 'Oscars' and 'BAFTA' ``Award`` rows exist.

    Safe to call repeatedly: a row is only created when no Award with that
    nickname exists yet. Unconditional creation produced duplicates on a
    second run, which makes a later ``Award.objects.get(hasNickname=...)``
    fail with ``MultipleObjectsReturned``.
    """
    for nickname in ('Oscars', 'BAFTA'):
        if not models.Award.objects.filter(hasNickname=nickname).exists():
            models.Award.objects.create(hasNickname=nickname)
def add_award_info():
    """Attach Oscar nomination data to every stored ``models.Film``.

    For each film, the first Oscar row with the same title and release year
    is looked up. Films without such a row are skipped. Otherwise the nominee
    name is categorised as an organisation or a person, the matching agent,
    award category and ceremony are upserted, and a ``NominationSituation``
    linking them to the film is upserted and printed.

    Raises:
        ValueError: if ``snlp.categorize`` returns a label other than
            'ORG' or 'PERSON'.
    """
    films: typing.Iterable[models.Film] = models.Film.objects.all()
    session = Session()
    # The 'Oscars' Award row is the same for every film: fetch it once, and
    # only when the first matching film is found (as the per-film lookup did).
    oscars_award = None
    # Close the session on every exit path; it used to be left open.
    try:
        for film in films:
            oscar = (
                session.query(awards.Oscar)
                .filter(awards.Oscar.film == film.hasTitle)
                .filter(awards.Oscar.year_film == film.hasInitialReleaseYear)
                .first()
            )
            if not oscar:
                continue

            res, name = snlp.categorize(oscar.name)
            if res == 'ORG':
                agent = models.Organization.upsert(hasName=name, label=name)
            elif res == 'PERSON':
                agent = models.Person.upsert(hasName=name)
            else:
                # Same exception type as before, now with enough context to
                # find the offending row.
                raise ValueError(
                    f'cannot categorize Oscar nominee {oscar.name!r}: '
                    f'expected ORG or PERSON, got {res!r}')

            if oscars_award is None:
                oscars_award = models.Award.objects.get(hasNickname='Oscars')
            award_cat = models.AwardCategory.get_instance_from_kaggle_oscar_data(
                oscar.category)
            award_cem = models.AwardCeremony.upsert(
                hasAward=oscars_award,
                yearHeld=oscar.year_ceremony,
                yearScreened=oscar.year_film,
                hasEditionNumber=oscar.ceremony)
            nom = models.NominationSituation.upsert(
                forFilm=film,
                hasAward=oscars_award,
                hasAwardCategory=award_cat,
                hasAwardCeremony=award_cem,
                isGivenTo=agent,
                win=oscar.winner,
            )
            print(f'{film} {nom}')
            # TODO: BAFTA nominations are not processed yet; the original
            # file only had a disabled query stub for them here.
    finally:
        session.close()
def add_imdb_info():
    """Enrich every stored ``models.Film`` with data from the IMDb tables.

    For each film this:
      * looks up its ``TitleBasics`` row by ``film.t_const`` and copies
        genres, the adult flag, an audience label and the runtime;
      * calls ``film.sync_from_wikidata()`` (a ``KeyError`` is ignored);
      * sets the country of origin from the production-company rows joined
        on title and start year, when the country code is known.

    Films without a ``TitleBasics`` row are reported and skipped.
    """
    films: typing.Iterable[models.Film] = models.Film.objects.all()
    session = Session()
    # Close the session on every exit path; it used to be left open.
    try:
        for film in films:
            tconst = film.t_const

            # --- title_basics -------------------------------------------
            title_basic: imdb.TitleBasics = (
                session.query(imdb.TitleBasics)
                .filter(imdb.TitleBasics.tconst == tconst)
                .first()
            )
            if title_basic is None:
                # Previously this crashed with AttributeError on the next line.
                print(f'no title_basics row for {tconst}; skipping {film}')
                continue

            # A NULL/empty genres value yields no genres instead of crashing
            # or creating a Genre with an empty label.
            genres = title_basic.genres.split(',') if title_basic.genres else []
            for genre in genres:
                film.hasGenre.add(models.Genre.upsert(label=genre))

            film.isAdult = bool(title_basic.isAdult)
            audience_label = 'Adults' if title_basic.isAdult == 1 else 'Children'
            film.hasAudience = models.Audience.upsert(label=audience_label)
            film.hasFeatureLengthInMinutes = title_basic.runtimeMinutes

            # --- title_akas + production companies + certificates -------
            # The join conditions must be combined with `&`. Python's `and`
            # does not build a SQL AND: it tests the truthiness of the first
            # comparison and returns a single operand, so the year half of
            # each condition was silently dropped.
            title_akas_lst = (
                session.query(imdb.TitleAkas,
                              imdb.ProductionCompanies,
                              imdb.Certificates)
                .filter(imdb.TitleAkas.titleId == tconst)
                .filter(imdb.TitleAkas.isOriginalTitle == 1)
                .join(imdb.TitleBasics,
                      imdb.TitleBasics.tconst == imdb.TitleAkas.titleId)
                .join(imdb.Certificates,
                      (imdb.Certificates.title == imdb.TitleAkas.title)
                      & (imdb.Certificates.year == imdb.TitleBasics.startYear))
                .join(imdb.ProductionCompanies,
                      (imdb.ProductionCompanies.title == imdb.TitleAkas.title)
                      & (imdb.ProductionCompanies.year
                         == imdb.TitleBasics.startYear))
                .all()
            )

            try:
                film.sync_from_wikidata()
            except KeyError:
                # Best effort, as in the original: a KeyError from the
                # Wikidata sync is deliberately ignored.
                pass

            for title_akas, prod, _certificate in title_akas_lst:
                if title_akas.isOriginalTitle == 1:
                    try:
                        if prod.country_code:
                            country = models.Country.objects.get(
                                alpha_2__iexact=prod.country_code.upper())
                            film.hasCountryOfOrigin = country
                            print(f'found: {title_akas}')
                    except ObjectDoesNotExist:
                        # Unknown country code: leave the country unset.
                        pass

            # NOTE(review): the scalar attributes assigned above are never
            # explicitly saved in this function; presumably persistence
            # happens elsewhere (e.g. in sync_from_wikidata) -- confirm,
            # especially for hasCountryOfOrigin, which is set after the sync.
            print(film)
    finally:
        session.close()
def read_alpha_2_to_countries(csv_file: str):
    """Create one ``models.Country`` row per line of a country-code CSV.

    The CSV must provide the columns 'Country', 'Alpha-2 code' and
    'Alpha-3 code'. Double quotes and surrounding whitespace are removed
    from both code values; the country name is stored as read.
    """
    import pandas

    def _clean(code):
        # Drop embedded double quotes, then trim whitespace.
        return code.replace('"', '').strip()

    table = pandas.read_csv(csv_file)[['Country', 'Alpha-2 code', 'Alpha-3 code']]
    for _, record in table.iterrows():
        models.Country.objects.create(
            alpha_3=_clean(record['Alpha-3 code']),
            alpha_2=_clean(record['Alpha-2 code']),
            label=record['Country'],
        )
if __name__ == '__main__':
    # Script entry point: first attach Oscar nominations to the films
    # already stored through the Django models ...
    add_award_info()
    # ... then rebuild the list of award films that IMDb can resolve. The
    # result is only held in memory; the CSV dump below is disabled.
    films = get_starting_films_from_awards()
    # dump_attrs_list(ROOT_DIR / 'mapping/films.csv', films)