"""Shared utility functions used in the project.

Functions:

"""

import os
import datetime
import getpass
import base64
import json
import re
from dotenv import load_dotenv, find_dotenv
import markdown
from markdownify import markdownify as md2text
from io import BytesIO
import pandas as pd

from pydantic import BaseModel, SecretStr

from langchain_community.vectorstores import Chroma
from langchain import hub
from langchain_core.prompts import PromptTemplate

#Model & Index & Embeddings
from langchain_openai import (
    ChatOpenAI,
)

from langchain_core.output_parsers import (
    PydanticOutputParser, 
)

from langchain_core.messages import (
    AIMessage,
    HumanMessage,
    get_buffer_string,
)

from PIL import Image

from ea4all.src.shared.prompts import (
    LLAMA31_CHAT_PROMPT_FORMAT,
)

from ea4all.src.shared.configuration import BaseConfiguration as ea4all_config

############
##INIT model
############
#initialise model / constants
EA4ALL_ARCHITECTURE = "ea4all_architecture.png"
EA4ALL_PODCAST = "ea4all_podcast.wav"

class CFG:
    # Constants
    EA4ALL_OVERVIEW = "ea4all_overview.png"
    EA4ALL_ABOUT = "ea4all_overview.txt"
    APM_MOCK_QNA = "apm_qna_mock.txt"
    STREAM_SLEEP = 0.05
    REGEX_BACKTICKS = r"```(.*?)```"

    # LLMs
    #model = {"gpt-4":'gpt-4o-mini', "gpt-4o":'gpt-4o'}
    #llama = {"11": "meta-llama/llama-3.2-11B-Vision-Instruct", "90":"meta-llama/llama-3.2-90B-Vision-Instruct", "70":"meta-llama/Llama-3.1-70B-Instruct", "73":"meta-llama/Llama-3.3-70B-Instruct"}
    
    #hf_model="meta-llama/Llama-3.1-70B-Instruct"
    #hf_api_base="https://api-inference.huggingface.co/models/"

    #hf_max_tokens=16192
    #max_new_tokens = 4096
    #llama32_max_tokens = 4096 ##TOKEN ISSUE LLAMA-3.2 w/ ChatOpenAI not working tokens > 4096 2024-10-13

    #temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15    

    # splitting
    split_chunk_size = 500
    split_overlap = 0
    
    # embeddings
    #embeddings_model = OpenAIEmbeddings()

    # similar passages
    k = 3

    #debug
    verbose=True

    #streaming
    #streaming=True

    #VQA resized images - maximum resolution for Llama-3.2
    RESIZE_TO = 512
    MAX_WIDTH = 1024
    MAX_HEIGHT = 768

    ##Diagrams format
    diagram_format = "png"

    # paths ea4all/src/tools
    #apm_store = "/Users/avfranco/Documents/GitHub/ea4all-agentic-staging/ea4all/apm_store/"
    #apm_path = apm_store + 'APM-ea4all (test-split).xlsx'
    #dbr_demo = apm_store + "reference_architecture_dbr_demo.txt"

    #'ea4all_images = "/Users/avfranco/Documents/GitHub/ea4all-agentic-staging/ea4all/images/"

    #apm_faiss = "apm_store"
    #faiss_index =  'apm_faiss_index'

###################################
##COLLECTION of re-usable functions
###################################

#return current date-time
def _get_datetime():
    now = datetime.datetime.now()
    return now.strftime("%m/%d/%Y, %H:%M:%S")

def _get_formatted_date():
    current_date = datetime.datetime.now()
    formatted_date = current_date.strftime("%d %B %Y")

    return formatted_date

#calculate elapsed time between start and end of execution
def time_elapsed(start, end):
    elapsed = int(round(end - start, 0))

    return f'{elapsed}'

def _join_paths(*paths):
    """
    Join two or more paths using os.path.join.

    Parameters:
    *paths: str
        Two or more path components to be joined.

    Returns:
    str
        The joined path.
    """
    return os.path.join(*paths)

#get user request info
def get_user_identification(request):
    if request:
        try:
            user_ip = request.headers.get('X-Forwarded-For')
            return user_ip.split(',')[0]
        except Exception:
            print(f"user info: {request}")
            return request.client.host
        
    return "ea4all_agent"

#Initialise model
## SETUP LLM CLIENT
def get_llm_client(model, api_base_url=None, temperature=0, streaming=False, tokens=ea4all_config.max_tokens) -> ChatOpenAI:
    """Initializes and returns a ChatOpenAI client based on the specified model and parameters."""
    client = ChatOpenAI()

    if model.startswith("gpt-"):
        client = ChatOpenAI(
            model=model,
            temperature=temperature,
            streaming=streaming,
            max_completion_tokens=tokens,
            stream_usage=True
        )
    elif "llama" in model.lower():  # Meta-llama models
        client = ChatOpenAI(
            model=model,
            api_key=SecretStr(os.environ['HUGGINGFACEHUB_API_TOKEN']),
            base_url=_join_paths(api_base_url, model, "v1/"),
            temperature=temperature,
            streaming=streaming,
            max_completion_tokens=tokens,
            stream_usage=True,
        )
    
    return client
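
# Example usage (a sketch, not executed on import). The Hugging Face base URL mirrors the
# commented-out hf_api_base constant above; both it and the model names are assumptions.
#   gpt_client = get_llm_client("gpt-4o-mini", temperature=0, streaming=True)
#   llama_client = get_llm_client(
#       "meta-llama/Llama-3.3-70B-Instruct",
#       api_base_url="https://api-inference.huggingface.co/models/",
#   )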

#load local env variables
def load_local_env(local):
    ###read local .env file
    _ = load_dotenv(find_dotenv()) 
    if local not in os.environ:
        os.environ[local] = getpass.getpass(f"Provide your {local} Key")
    return os.environ[local]

#load landscape data into Chroma
def load_to_chroma(documents, embeddings, path, collection_name="apm_collection"):
    #Open chromadb chroma-apm-db collection
    chroma_collection = Chroma(
        collection_name=collection_name,
        persist_directory=path,
        embedding_function=embeddings
    )

    #Drop any existing records before reloading
    if chroma_collection._collection.count():
        chroma_collection.delete_collection()

    #Add apm records
    chroma_collection = Chroma.from_documents(
        collection_name=collection_name,
        persist_directory=path,
        documents=documents,
        embedding=embeddings
    )
    chroma_collection.persist()

    return chroma_collection

##Convert gradio chat_history to langchain chat_history_format
def get_history_gradio(history, chat_history=None):
    history_langchain_format = []
    #triggered by loaded memory runnable to replace ConversationMemoryBuffer.load_memory_variables
    #if chat_history or not history:
    #    memory = chat_history
    #triggered by loaded_memory runnable
    #else:
    history = history["chat_memory"]
    
    for human, ai in history:
        history_langchain_format.append(HumanMessage(content=human))
        history_langchain_format.append(AIMessage(content=ai))
 
    history = {"history":get_buffer_string(history_langchain_format)}

    return history
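
# Example (sketch): one gradio [user, assistant] turn under "chat_memory" is flattened
# into a LangChain buffer string; the messages below are illustrative.
#   get_history_gradio({"chat_memory": [["Hi", "Hello! How can I help?"]]})
#   # -> {'history': 'Human: Hi\nAI: Hello! How can I help?'}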

#example prompts and images for the visual question-answering (VQA) tool
def get_vqa_examples() -> list:
    examples=[
        {"text": "Describe this image.", "files": ["ea4all/images/multi-app-architecture.png"]},
        {"text": "Assess any risk and vulnerabilities in the current solution.", "files": ["ea4all/images/ea4all_architecture.png"]},
    ]
    return examples

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def resize_image_1(raw_image, input_size):
    w, h = raw_image.size
    scale = input_size / max(w, h)
    new_w = int(w * scale)
    new_h = int(h * scale)
    resized_image = raw_image.resize((new_w, new_h))

    return resized_image

def resize_image_2(image, width):
    wpercent = width / float(image.size[0])
    hsize = int( float(image.size[1]) * wpercent )
    raw_image = image.resize((width, hsize))

    return raw_image

def resize_image_3(image):
    # Get the current size
    width, height = image.size

    # Calculate the new size maintaining the aspect ratio
    if width > CFG.MAX_WIDTH or height > CFG.MAX_HEIGHT:
        ratio = min(CFG.MAX_WIDTH / width, CFG.MAX_HEIGHT / height)
        new_width = int(width * ratio)
        new_height = int(height * ratio)
    else:
        new_width, new_height = width, height
    
    # Resize the image
    image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    
    # Return new resized image
    return image

#Encode PIL.Image to base64
def encode_raw_image(raw_image):
    # Create a BytesIO buffer
    buffer = BytesIO()

    # Save the image to the buffer in PNG format
    raw_image.save(buffer, format='PNG')

    # Get the content of the buffer
    img_bytes = buffer.getvalue()

    # Encode the bytes to base64
    img_base64 = base64.b64encode(img_bytes)

    # Convert the bytes to string
    img_str = img_base64.decode('utf-8')

    return img_str

#Return a raw image ready to OpenAI GPT4-Vision
def get_raw_image(image_path):
    # Open & Resize & Encode image
    diagram = Image.open(image_path)
    w, h = diagram.size
    if w > CFG.RESIZE_TO or h > CFG.RESIZE_TO:
        resized_image = resize_image_3(diagram)
    else:
        resized_image = diagram

    #Encode diagram
    raw_image = encode_raw_image(resized_image)

    return raw_image
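
# Example (sketch): the base64 string can be embedded in a GPT-4 Vision style message.
# The image path comes from get_vqa_examples; the prompt text is illustrative.
#   raw_image = get_raw_image("ea4all/images/multi-app-architecture.png")
#   message = HumanMessage(content=[
#       {"type": "text", "text": "Describe this image."},
#       {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{raw_image}"}},
#   ])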

def load_mock_content(file_path) -> str:
    try:
        with open(_join_paths(ea4all_config.ea4all_store, file_path), "r") as file:
            content = file.read()
            return content
    except (OSError, ValueError) as e:
        return str(e)
    
def print_json_to_md(data, indent=0, column=None):
    try:
        result = ""
        header = ""
        body = ""
        if isinstance(data, dict):
            for key, value in data.items():
                result  +=  print_json_to_md(value, indent + 2, key)
            return result
        elif isinstance(data, list):
            if column:  # Print list items as a Markdown table
                header = ' ' * indent + f"| {' | '.join(data[0].keys())} | \n"
                header += ' ' * indent + f"| {' | '.join(['---'] * len(data[0]))} | \n"
                for item in data:
                    body += ' ' * indent + f"\n\n | {' | '.join(str(item[k]) for k in item.keys())} |"
                result += header + body
                return result
            else:
                for item in data:
                    header = ' ' * indent + f"| {' | '.join(data[0].keys())} |"
                    body += ' ' * indent + f"\n\n | {' | '.join(str(item[k]) for k in item.keys())} |"
                result += header + "\n" + body
                return result
        else:
            header +=  ' ' * indent + f"| {column} "
            body += f"{str(data)}\n\n"
            result += header + body
            return result

    except Exception as e:
        return f"{e} - {data}"

def markdown_to_plain_text(md):
    # Convert Markdown to HTML
    html = markdown.markdown(md)
    # Convert the HTML back to text with markdownify, stripping the formatting
    plain_text = md2text(html)
    return plain_text

def extract_structured_output(response):
    ##EXTRACT Topic from the content
    try:
        return json.loads(response)
    except ValueError:
        match = re.search(CFG.REGEX_BACKTICKS, response, re.DOTALL)
        
        if match:
            return json.loads(match.group(1))
        else:
            return None

def get_predicted_num_tokens(llm, content):
    return llm.get_num_tokens(content)

def get_predicted_num_tokens_from_prompt(llm, prompt, values):
    final_prompt = prompt.format(**values)
    return llm.get_num_tokens(final_prompt)

def set_max_new_tokens(predicted_tokens):
    #Return max new tokens to be generated
    return int((ea4all_config.max_tokens - predicted_tokens) * 0.95)
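
# Example (sketch, assuming ea4all_config.max_tokens == 4096 for illustration):
#   set_max_new_tokens(1000)  # -> int((4096 - 1000) * 0.95) == 2941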

def escape_special_characters(input_string):
    # Use json.dumps to escape special characters
    escaped_string = json.dumps(input_string)
    # Remove the surrounding double quotes added by json.dumps
    return escaped_string[1:-1]

def clean_and_load_json(content) -> dict:
    try:
        json_data = json.loads(content)
        return json_data
    except ValueError:
        #Strip the leading "json" language tag and newlines before retrying
        clean_string = content.strip().removeprefix("json").replace("\n", "")
        json_data = json.loads(clean_string)
        return json_data

def extract_response_from_backticks(response):
    match = re.search(CFG.REGEX_BACKTICKS, str(response), re.DOTALL)

    return match.group(1) if match else response  

def extract_topic_from_business_input(response) -> dict:
    ##IS JSON already
    if isinstance(response, dict):
        return response
    
    ##EXTRACT Topic from the content
    topic = extract_response_from_backticks(response)
    
    return clean_and_load_json(topic)
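
# Example (sketch, hypothetical values): a model reply wrapped in triple backticks is
# unwrapped and parsed into a dict.
#   reply = '```json\n{"topic": "payments", "description": "Card processing platform"}\n```'
#   extract_topic_from_business_input(reply)
#   # -> {'topic': 'payments', 'description': 'Card processing platform'}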

## LLM STRUCTURED OUTPUT Helper functions
def extract_landscape(topic):
    # Prompt
    extract_landscape_prompt = hub.pull('learn-it-all-do-it-all/ea4all_togaf_landscape_business_query')

    # Set up a parser: LandscapeAsIs
    parser = PydanticOutputParser(pydantic_object=topic) 

    final_prompt = extract_landscape_prompt.partial(
        format_instructions=parser.get_format_instructions(),
        ai_output=LLAMA31_CHAT_PROMPT_FORMAT,
    )

    return final_prompt

def extract_principles(topic):

    # Set up a parser: LandscapeAsIs
    parser = PydanticOutputParser(pydantic_object=topic) 

    #PROMPT REVISED TO WORK w/ Llama-3
    principle_template = """Identify the list of principles and its meaning from the given context.
    Do not add any superfluous information.
    Context: \n {strategic_principles} \n
    Output your answer as JSON that matches the given schema and nothing else: \n{format_instructions}\n
    """

    prompt = PromptTemplate(
        template=principle_template,
        input_variables=["strategic_principles"],
        partial_variables={
            "format_instructions": parser.get_format_instructions(),
            },
    )

    return prompt
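
# Example (sketch, assuming a Pydantic schema named Principles; the principle text is illustrative):
#   prompt = extract_principles(Principles)
#   prompt.format(strategic_principles="Buy before build. Cloud first.")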

# Task-1: Identify the business requirements, objectives, user journey, and all other relevant information
def extract_detailed_business_requirements(llm, topic: type[BaseModel], name:str, values:dict):
    parser = PydanticOutputParser(pydantic_object=topic)
    
    hub_prompt = hub.pull('learn-it-all-do-it-all/ea4all_extract_business_topic')
    hub_prompt = hub_prompt.partial(
        topic=name,
        format_instructions=parser.get_format_instructions(),
    )
    
    task_1_requirement = hub_prompt | llm | parser
    response = task_1_requirement.invoke(
        input=values, 
        config={
            'tags': ['assess_business_query'],
            'run_name': name  # Custom run name
        }
    )

    return response

# Post-processing
def format_docs(docs):
    return "\n".join(doc.page_content for doc in docs)

#load mock data
def get_relevant_questions(source: str) -> list:
    mock = load_mock_content(source)
    relevant_questions = mock.splitlines()

    return relevant_questions