Spaces: Running on T4
Update auditqa/doc_process.py
Browse files: auditqa/doc_process.py (+4 -1)
auditqa/doc_process.py
CHANGED
|
@@ -41,13 +41,16 @@ def process_pdf():
|
|
| 41 |
doc_processed = text_splitter.split_documents(value)
|
| 42 |
for doc in doc_processed:
|
| 43 |
doc.metadata["source"] = file
|
|
|
|
| 44 |
all_documents[file] = doc_processed
|
| 45 |
|
| 46 |
print(all_documents.keys())
|
| 47 |
|
| 48 |
|
| 49 |
embeddings = HuggingFaceEmbeddings(
|
| 50 |
-
|
|
|
|
|
|
|
| 51 |
)
|
| 52 |
|
| 53 |
qdrant_collections = {}
|
|
|
|
| 41 |
doc_processed = text_splitter.split_documents(value)
|
| 42 |
for doc in doc_processed:
|
| 43 |
doc.metadata["source"] = file
|
| 44 |
+
doc.metadata["year"] = file[-4:]
|
| 45 |
all_documents[file] = doc_processed
|
| 46 |
|
| 47 |
print(all_documents.keys())
|
| 48 |
|
| 49 |
|
| 50 |
embeddings = HuggingFaceEmbeddings(
|
| 51 |
+
model_kwargs = {'device': 'cpu'},
|
| 52 |
+
encode_kwargs = {'normalize_embeddings': True},
|
| 53 |
+
model_name="BAAI/bge-small-en-v1.5"
|
| 54 |
)
|
| 55 |
|
| 56 |
qdrant_collections = {}
|