Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
5feffa1
Updated
vibhansa-msft Sep 17, 2025
e9078d2
Updated
vibhansa-msft Sep 17, 2025
d510bef
Updated
vibhansa-msft Sep 17, 2025
b0a8444
Updated
vibhansa-msft Sep 17, 2025
125366c
Updated
vibhansa-msft Sep 17, 2025
56aaa25
Updated
vibhansa-msft Sep 17, 2025
cb23eb6
Updated
vibhansa-msft Sep 17, 2025
21d1a8e
Updated
vibhansa-msft Sep 17, 2025
cbd4aa4
Updated
vibhansa-msft Sep 17, 2025
b0a023a
Updated
vibhansa-msft Sep 17, 2025
506e36f
Updated
vibhansa-msft Sep 17, 2025
1cc4bca
Updated
vibhansa-msft Sep 17, 2025
32aab92
Updated
vibhansa-msft Sep 18, 2025
2da022e
Add error handling
vibhansa-msft Sep 18, 2025
eb49069
Updated
vibhansa-msft Sep 18, 2025
1b9a595
Updated
vibhansa-msft Sep 18, 2025
fb4ec2f
Updated
vibhansa-msft Sep 18, 2025
b309734
Updated
vibhansa-msft Sep 18, 2025
d84a6a0
Updated
vibhansa-msft Sep 18, 2025
d4f82b0
Updated
vibhansa-msft Sep 18, 2025
c3c1002
Updated
vibhansa-msft Sep 18, 2025
40b72ab
Updated
vibhansa-msft Sep 18, 2025
10251aa
Updated
vibhansa-msft Sep 18, 2025
0d05c1b
Updated
vibhansa-msft Sep 18, 2025
76b1278
Updated
vibhansa-msft Sep 18, 2025
10ba7e8
Updated
vibhansa-msft Sep 18, 2025
f4e75f9
Updated
vibhansa-msft Sep 18, 2025
f60184c
Updated
vibhansa-msft Sep 18, 2025
7ebe10c
Updated
vibhansa-msft Sep 18, 2025
31851c1
Updated
vibhansa-msft Sep 18, 2025
85fe5ed
Finalizing filter pattern
vibhansa-msft Sep 18, 2025
b14403f
Updated
vibhansa-msft Sep 18, 2025
7e76bd1
Updated
vibhansa-msft Sep 22, 2025
3aa019c
Adding script to retrieve all issues and PRs and normalize the data
vibhansa-msft Sep 22, 2025
09f4aee
Updated
vibhansa-msft Sep 22, 2025
1de76bb
Adding workflow to dump list of issues and PRs every month
vibhansa-msft Sep 23, 2025
64dd3e5
Adding current issue and pr dump
vibhansa-msft Sep 23, 2025
cab85f6
Adding authentication
vibhansa-msft Sep 23, 2025
4d28648
Updated
vibhansa-msft Sep 23, 2025
e91a9f0
Updated
vibhansa-msft Sep 23, 2025
b22b513
Updated
vibhansa-msft Sep 23, 2025
21cf3ad
Updated
vibhansa-msft Sep 23, 2025
022f77c
Updated
vibhansa-msft Sep 23, 2025
a9df0bd
Updated
vibhansa-msft Sep 23, 2025
b1e5dde
Updated
vibhansa-msft Sep 23, 2025
1401521
Updated
vibhansa-msft Sep 23, 2025
8f5b20d
Updated
vibhansa-msft Sep 23, 2025
8a76787
Updated
vibhansa-msft Sep 23, 2025
792c3e0
Updated
vibhansa-msft Sep 23, 2025
b65ae36
Updated
vibhansa-msft Sep 23, 2025
fa2e9ca
Updated
vibhansa-msft Sep 23, 2025
896f5f9
Updated
vibhansa-msft Sep 23, 2025
3d798c2
Updated
vibhansa-msft Sep 23, 2025
f79e44d
Updated
vibhansa-msft Sep 23, 2025
2a65b0c
Updated
vibhansa-msft Sep 23, 2025
53d51d8
Updated
vibhansa-msft Sep 23, 2025
b161bb6
Updated
vibhansa-msft Sep 23, 2025
8bbd6da
Updated
vibhansa-msft Sep 23, 2025
2eef7c6
Updated
vibhansa-msft Sep 24, 2025
24280b5
Updated
vibhansa-msft Sep 24, 2025
c7ee541
Updated
vibhansa-msft Sep 25, 2025
e95a764
Merge remote-tracking branch 'upstream/main'
vibhansa-msft Sep 29, 2025
7b3c99d
Merge branch 'main' into main
vibhansa-msft Sep 29, 2025
abb8ffc
Merge branch 'Azure:main' into main
vibhansa-msft Oct 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions .github/scripts/ai_summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import sys
import os
import subprocess
from transformers import pipeline, AutoTokenizer

# The maximum sequence length for this model. This is a fixed constraint:
# distilbart-cnn-12-6 cannot attend to more tokens than this in one pass,
# so longer inputs must be chunked (see refine_summary_from_chunks).
MAX_SEQUENCE_LENGTH = 1024

def summarize_text_with_llm(text, model_name="sshleifer/distilbart-cnn-12-6"):
    """
    Summarize *text* using a Hugging Face summarization model.

    Inputs longer than MAX_SEQUENCE_LENGTH tokens are split into chunks,
    summarized individually, and then refined in a second pass.

    Args:
        text: The raw text to summarize.
        model_name: Hugging Face model id used for both the pipeline and its
            tokenizer. Defaults to the previously hard-coded checkpoint, so
            existing callers are unaffected.

    Returns:
        The summary string, or a fixed fallback message when summarization
        fails for any reason.
    """
    try:
        # Pipeline and tokenizer must come from the same checkpoint so the
        # token count measured below matches what the model actually sees.
        summarizer = pipeline("summarization", model=model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # truncation=False so we measure the true length instead of the
        # tokenizer silently clipping to the model maximum.
        token_count = len(tokenizer.encode(text, truncation=False))

        # Short enough for a single pass: summarize directly.
        if token_count <= MAX_SEQUENCE_LENGTH:
            summary = summarizer(text, max_length=150, min_length=50, do_sample=False)
            return summary[0]['summary_text']

        # Too long for one pass: chunk, summarize each chunk, then refine.
        return refine_summary_from_chunks(text, summarizer, tokenizer)

    except Exception as e:
        print(f"Error summarizing with LLM: {e}", file=sys.stderr)
        # Fallback sentinel the caller embeds verbatim in its output.
        return "Could not generate AI summary. Full response below."

def refine_summary_from_chunks(text, summarizer, tokenizer, max_tokens=None):
    """
    Split *text* into token-limited chunks, summarize each, then refine.

    Args:
        text: The full text to summarize.
        summarizer: Summarization pipeline; called as
            summarizer(chunk, max_length=..., min_length=..., do_sample=False)
            and expected to return [{'summary_text': str}].
        tokenizer: Tokenizer whose encode(text, truncation=False) result is
            used to measure chunk sizes in tokens.
        max_tokens: Per-chunk token budget. Defaults to the module-level
            MAX_SEQUENCE_LENGTH (the model's fixed limit), preserving the
            old behavior for existing callers.

    Returns:
        The final refined summary string.
    """
    limit = MAX_SEQUENCE_LENGTH if max_tokens is None else max_tokens

    # Split on sentence boundaries to avoid cutting a sentence in half.
    sentences = text.split('. ')
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        # Re-insert the '. ' separator that split() consumed.
        temp_chunk = current_chunk + ('. ' if current_chunk else '') + sentence
        if len(tokenizer.encode(temp_chunk, truncation=False)) <= limit:
            current_chunk = temp_chunk
        else:
            # BUG FIX: when the very first sentence alone exceeds the limit,
            # the old code appended an empty chunk, which was later fed to
            # the summarizer. Only flush non-empty chunks.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence

    # Flush the final partial chunk.
    if current_chunk:
        chunks.append(current_chunk)

    # First pass: summarize each chunk independently.
    intermediate_summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Summarizing chunk {i+1} of {len(chunks)}...", file=sys.stderr)
        summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)
        intermediate_summaries.append(summary[0]['summary_text'])

    # Second pass: combine and refine to remove cross-chunk redundancy.
    combined_summary_text = " ".join(intermediate_summaries)
    print("Refining combined summaries into a final output...", file=sys.stderr)
    final_summary = summarizer(combined_summary_text, max_length=150, min_length=50, do_sample=False)

    return final_summary[0]['summary_text']


if __name__ == "__main__":
    # CLI: ai_summarize.py <input_file_path> <output_file_path>
    if len(sys.argv) < 3:
        print("Usage: python ai_summarize.py <input_file_path> <output_file_path>", file=sys.stderr)
        sys.exit(1)

    input_file_path = sys.argv[1]
    output_file_path = sys.argv[2]

    # Fail fast if the input file does not exist.
    if not os.path.exists(input_file_path):
        print(f"Error: Input file not found at {input_file_path}", file=sys.stderr)
        sys.exit(1)

    with open(input_file_path, 'r', encoding='utf-8') as f:
        full_text = f.read()

    # Use the LLM to summarize the text.
    summary = summarize_text_with_llm(full_text)
    # BUG FIX: the old check was `summary == ""`, which never fires because
    # the failure path of summarize_text_with_llm returns a non-empty
    # fallback message. Check falsiness instead, and route the error to
    # stderr for consistency with the other error paths above.
    if not summary:
        print("Error: Could not generate summary.", file=sys.stderr)
        sys.exit(1)

    # A very short "summary" is treated as a degenerate model output.
    summary_len = len(summary)
    if summary_len < 50:
        print("Error: Generated summary is too short to be meaningful.", file=sys.stderr)
        sys.exit(1)

    resp_len = len(full_text) + summary_len
    issue_comment_body = (
        "## Summary \n\n"
        f"{summary}\n\n"
    )

    # Only inline the full response when the comment stays under 65000
    # characters (presumably GitHub's ~65K comment size limit -- confirm).
    # Demote the response's headings so they nest under "Details".
    if resp_len < 65000:
        full_text = full_text.replace("## ", "### ")
        issue_comment_body += (
            "## Details \n\n"
            f"{full_text}\n\n"
        )

    # Add the disclaimer
    issue_comment_body += (
        "---\n"
        "**In case of issue, share mount command, config file and debug-logs to investigate further.**\n\n"
        "---\n"
        "*Disclaimer: This summary is AI-generated and may not be fully accurate.*"
    )

    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(issue_comment_body)
71 changes: 71 additions & 0 deletions .github/scripts/deepwiki_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import asyncio
import os
import sys
from contextlib import AsyncExitStack
from typing import Optional
import subprocess

import httpx
from mcp import ClientSession
from mcp.client.sse import sse_client
from dotenv import load_dotenv

load_dotenv()

# The URL for the DeepWiki MCP server's SSE transport.
# NOTE(review): despite the old "Example URL" comment, this is the actual
# endpoint main() connects to.
MCP_SSE_URL = "https://mcp.deepwiki.com/sse" # Example URL

class MCPClient:
    """Minimal MCP client that talks to a server over SSE transport."""

    def __init__(self):
        # BUG FIX: initialize all context attributes to None so cleanup()
        # is safe to call even when connect_to_sse_server() failed part-way
        # or was never called -- the old code only created these attributes
        # inside connect, so cleanup() raised AttributeError in that case.
        self.session = None
        self._streams_context = None
        self._session_context = None
        # NOTE(review): exit_stack is created but never used by this class.
        self.exit_stack = AsyncExitStack()

    async def connect_to_sse_server(self, server_url: str):
        """Open the SSE transport to *server_url* and initialize a session."""
        self._streams_context = sse_client(url=server_url)
        streams = await self._streams_context.__aenter__()
        self._session_context = ClientSession(*streams)
        self.session = await self._session_context.__aenter__()
        await self.session.initialize()

    async def cleanup(self):
        """Close the session and transport contexts; safe when unconnected."""
        if self._session_context is not None:
            await self._session_context.__aexit__(None, None, None)
        if self._streams_context is not None:
            await self._streams_context.__aexit__(None, None, None)

    async def ask_deepwiki(self, repo: str, question: str) -> str:
        """Send *question* about *repo* to the server's ask_question tool.

        Raises:
            RuntimeError: if connect_to_sse_server() has not been called.
        """
        if not self.session:
            raise RuntimeError("Client not connected.")

        # The MCP SDK handles the JSON-RPC call for you
        result = await self.session.call_tool(
            "ask_question", {"repoName": repo, "question": question}
        )

        # NOTE(review): this returns the raw content object/list, not a
        # joined string as the old comment claimed; downstream
        # extract_text.py parses its repr, so the raw form is kept.
        return result.content

async def main(repo, title, body):
    """Connect to the DeepWiki MCP server, ask one question, print the reply.

    Args:
        repo: Repository name to query (e.g. "owner/name").
        title: Question title; becomes the first part of the prompt.
        body: Question body; appended after a blank line.
    """
    mcp = MCPClient()
    try:
        await mcp.connect_to_sse_server(server_url=MCP_SSE_URL)
        answer = await mcp.ask_deepwiki(repo, f"{title}\n\n{body}")
        print(answer)
    finally:
        # Always release the SSE/session contexts, even on failure.
        await mcp.cleanup()

if __name__ == "__main__":
    # CLI: deepwiki_query.py <repo> <question title> <question body>
    if len(sys.argv) < 4:
        # BUG FIX: usage errors go to stderr so stdout stays reserved for
        # the DeepWiki response (consistent with ai_summarize.py).
        print("Usage: python deepwiki_query.py <repo> <question title> <question body>", file=sys.stderr)
        sys.exit(1)

    repo_arg = sys.argv[1]
    issue_title = sys.argv[2]
    issue_body = sys.argv[3]

    asyncio.run(main(repo_arg, issue_title, issue_body))
78 changes: 78 additions & 0 deletions .github/scripts/extract_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import re
import sys

def extract_text_field(text_content: str) -> str:
    """
    Extracts the 'text' field from a string representation of a TextContent object.

    Args:
        text_content: The input string containing the TextContent representation.

    Returns:
        The extracted text field as a string, or an empty string if a regex
        error occurs. Exits the process when no text field is present.
    """
    # The greedy `(.*)` with re.DOTALL captures all characters, including
    # newlines. The positive lookahead pins the match to the specific
    # trailing `, annotations=None, meta=None)]` suffix, so quotes inside
    # the text content itself cannot terminate the match early.
    try:
        pattern = ""
        if 'text="' in text_content:
            pattern = r"text=\"(.*)\"(?=, annotations=None, meta=None\)])"
        elif "text='" in text_content:
            pattern = r"text='(.*)'(?=, annotations=None, meta=None\)])"
        else:
            # BUG FIX: errors go to stderr -- stdout is the channel the
            # extracted text is printed on, so writing errors there would
            # corrupt the downstream consumer's capture.
            print("Error: The input text does not contain a recognizable text field.", file=sys.stderr)
            sys.exit(1)

        match = re.search(pattern, text_content, re.DOTALL)
        if match:
            # BUG FIX: the old `.encode('utf-8').decode('unicode_escape')`
            # mangled non-ASCII text (every UTF-8 byte was reinterpreted as
            # a Latin-1 char). latin-1 + backslashreplace round-trips code
            # points so only escape sequences (\n, \t, \uXXXX) are decoded.
            decoded_text = match.group(1).encode('latin-1', 'backslashreplace').decode('unicode_escape')

            # Remove everything after either "## Notes\n" or "Notes:\n" --
            # trailing boilerplate that should not reach the output.
            for delimiter in ["## Notes\n", "Notes:\n"]:
                if delimiter in decoded_text:
                    decoded_text = decoded_text.split(delimiter)[0]

            # Replace all double quotes with single quotes so the result is
            # safe to embed in quoted contexts downstream.
            decoded_text = decoded_text.replace('"', "'")

            return decoded_text.strip()
        else:
            print("No match found for the text field.", file=sys.stderr)
            sys.exit(1)

    except re.error as e:
        print(f"Regular expression error: {e}", file=sys.stderr)

    return ""

if __name__ == "__main__":
    # CLI: extract_text.py <path_to_input_file>
    if len(sys.argv) < 2:
        print("Usage: python extract_text.py <path_to_input_file>", file=sys.stderr)
        sys.exit(1)

    file_path = sys.argv[1]

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            input_text = f.read()

        # Reject trivially small inputs: they cannot hold a real
        # TextContent payload worth parsing.
        input_len = len(input_text)
        if input_len < 100:
            print("Input text is too short to process.", file=sys.stderr)
            sys.exit(1)

        extracted_content = extract_text_field(input_text)
        if extracted_content:
            # The extracted text is the sole stdout output, so callers can
            # capture it directly; all diagnostics go to stderr.
            print(extracted_content)
        else:
            print("Error: Could not extract text content.", file=sys.stderr)
            sys.exit(1)

    except FileNotFoundError:
        # BUG FIX: the old handler printed the error (to stdout) and fell
        # through, exiting 0 -- callers could not detect the failure.
        print(f"Error: The file '{file_path}' was not found.", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # BUG FIX: likewise exit non-zero on any unexpected failure.
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)
Loading
Loading