Build an AI-powered sales research assistant that extracts live data from LinkedIn, Crunchbase, and news sources, stores it in MongoDB Atlas for semantic search, and answers complex sales questions using Haystack and Bright Data.
This cookbook walks you through building a complete lead intelligence pipeline - from scraping to RAG-powered Q&A.
import os

from dotenv import load_dotenv

# Load keys from the local .env file, overriding any stale shell values.
load_dotenv(override=True)

# Accept GOOGLE_AI_API_KEY as an alias for GOOGLE_API_KEY.
if not os.environ.get("GOOGLE_API_KEY") and os.environ.get("GOOGLE_AI_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = os.environ["GOOGLE_AI_API_KEY"]

# Verify all required keys are loaded before continuing.
required_keys = ["BRIGHT_DATA_API_KEY", "MONGO_CONNECTION_STRING", "GOOGLE_API_KEY"]
missing_keys = [key for key in required_keys if not os.environ.get(key)]
if missing_keys:
    raise ValueError(f"Please add {', '.join(missing_keys)} to your .env file")
else:
    print("All environment variables loaded successfully")
from haystack_brightdata import BrightDataWebScraper

# Discover which Bright Data datasets are available for scraping.
datasets = BrightDataWebScraper.get_supported_datasets()
print(f"Total available datasets: {len(datasets)}\n")
print("Sales research relevant datasets:")
print("-" * 50)

# Show only the datasets that matter for sales research.
relevant_keywords = ["linkedin", "crunchbase", "company", "profile"]
for dataset in datasets:
    if any(keyword in dataset['id'].lower() for keyword in relevant_keywords):
        print(f" {dataset['id']}")
        print(f" {dataset['description']}\n")
MongoDB Atlas serves as the vector database for storing embedded lead data and enabling semantic search.

1. Create a MongoDB Atlas Cluster

Follow the Get Started with Atlas guide to:
Create a free cluster (M0 tier is sufficient for testing)
Set up database access credentials
Configure network access (allow your IP or use 0.0.0.0/0 for testing)
from haystack_integrations.components.retrievers.mongodb_atlas import MongoDBAtlasEmbeddingRetriever

# Embedding retriever backed by the MongoDB Atlas vector index.
# `document_store` is the MongoDBAtlasDocumentStore created earlier in the notebook.
retriever = MongoDBAtlasEmbeddingRetriever(document_store=document_store)
Report incorrect code
Copy
from haystack_brightdata import BrightDataWebScraper

# Initialize the Web Scraper.
# Note: Automatically uses BRIGHT_DATA_API_KEY from environment.
scraper = BrightDataWebScraper()
Extract company intelligence from Crunchbase - funding information, investors, employee count, and more.
Report incorrect code
Copy
import json

company_url = "https://www.crunchbase.com/organization/openai"


def coalesce(data, *keys, default="N/A"):
    """Return the first non-empty value of *keys* in *data*, else *default*."""
    for key in keys:
        value = data.get(key)
        if value not in (None, "", [], {}):
            return value
    return default


def format_industries(industries):
    """Flatten a Crunchbase industries field (list of dicts/strings) to a string."""
    if not industries:
        return "N/A"
    if isinstance(industries, list):
        values = []
        for item in industries:
            if isinstance(item, dict):
                # Crunchbase uses varying keys for the label across payloads.
                value = item.get("value") or item.get("name") or item.get("id")
                if value:
                    values.append(value)
            else:
                values.append(str(item))
        return ", ".join(values) if values else "N/A"
    return industries


def parse_company(result):
    """Normalize a scraper result to a single company dict.

    Handles the payload being nested under "data", JSON-encoded as a string,
    or wrapped in a list; returns {} when nothing usable is present.
    """
    raw = result.get("data", result)
    if isinstance(raw, str):
        raw = json.loads(raw)
    if isinstance(raw, list):
        return raw[0] if raw else {}
    if isinstance(raw, dict):
        return raw
    return {}


# Scrape the company profile and print the key sales-relevant fields.
result = scraper.run(dataset="crunchbase_company", url=company_url)
company_data = parse_company(result)
industries = format_industries(company_data.get("industries"))

print(f"Company: {coalesce(company_data, 'name', 'legal_name')}")
print(f"Overview: {coalesce(company_data, 'about', 'company_overview')}")
print(f"Industries: {industries}")
print(f"Operating Status: {coalesce(company_data, 'operating_status')}")
print(f"Website: {coalesce(company_data, 'website', 'url')}")
print(f"Employees: {coalesce(company_data, 'num_employees', 'number_of_employee_profiles')}")
Expected output:
Report incorrect code
Copy
Company: OpenAI
Overview: OpenAI is an AI research and deployment company that develops advanced AI models, including ChatGPT.
Industries: Agentic AI, Artificial Intelligence (AI), Generative AI, Machine Learning, SaaS
Operating Status: active
Website: https://www.openai.com
Employees: 1001-5000
Name: Satya Nadella
Position: Chairman and CEO at Microsoft
Location: Redmond, Washington, United States, US
Current Company: Microsoft
Followers: 11816477
Connections: 500
Experience (5 roles):
 1. Chairman and CEO at Microsoft (N/A)
 2. Member Board Of Trustees at University of Chicago (N/A)
 3. Board Member at Starbucks (N/A)
Test the complete indexing flow by scraping a company and indexing it into MongoDB Atlas.
Report incorrect code
Copy
from pymongo import MongoClient

# Connect directly with pymongo to verify/create the target collection.
client = MongoClient(os.environ.get("MONGO_CONNECTION_STRING"))
db = client[document_store.database_name]

if document_store.collection_name not in db.list_collection_names():
    db.create_collection(document_store.collection_name)
    print(f"Created collection '{document_store.collection_name}'")
else:
    print(f"Collection '{document_store.collection_name}' already exists")

collection = db[document_store.collection_name]
doc_count = collection.count_documents({})
print(f" Current document count: {doc_count}")
Report incorrect code
Copy
# Scrape and index a company from Crunchbase.
company_url = "https://www.crunchbase.com/organization/openai"

# Step 1: Scrape the company.
scraper_result = scraper.run(dataset="crunchbase_company", url=company_url)

# Step 2: Transform into Haystack Documents.
documents = create_company_documents(
    scraper_result=scraper_result,
    source_url=company_url,
    dataset_type="crunchbase_company",
)
print(f"Created {len(documents)} document(s)")
print(f"Content (first 200 chars): {documents[0].content[:200]}...")
print(f"Metadata: {documents[0].meta}")

# Step 3: Generate embeddings and index into MongoDB.
result = indexing_pipeline.run({"embedder": {"documents": documents}})
print(f"Indexed {result['writer']['documents_written']} document(s) into MongoDB")
from haystack import Pipeline
from haystack.components.builders import ChatPromptBuilder
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.embedders.google_genai import GoogleGenAITextEmbedder
from haystack_integrations.components.generators.google_genai import GoogleGenAIChatGenerator

# System prompt: fixes the assistant's persona and answer style.
system_message = ChatMessage.from_system(
    """You are a sales intelligence assistant. Your role is to analyze company and people data to provide actionable sales intelligence.

When answering queries:
- Cite specific company names and details from the data
- Provide insights relevant for sales outreach
- Highlight key information like funding, company size, location, recent news
- Suggest talking points for personalized outreach"""
)

# User prompt template: Jinja loop injects the retrieved documents as context.
user_template = """Based on the following company/person data, answer the user's question.

Context:
{% for document in documents %}
{{ document.content }}
---
{% endfor %}

Question: {{ question }}

Provide a detailed, actionable answer based on the retrieved data."""
user_message = ChatMessage.from_user(user_template)

# Assemble the RAG pipeline: embed the question, retrieve similar documents,
# build the chat prompt, then generate the answer with Gemini.
rag_pipeline = Pipeline()
rag_pipeline.add_component("text_embedder", GoogleGenAITextEmbedder(model="text-embedding-004"))
rag_pipeline.add_component("retriever", MongoDBAtlasEmbeddingRetriever(document_store=document_store, top_k=5))
rag_pipeline.add_component("prompt_builder", ChatPromptBuilder(template=[system_message, user_message]))
rag_pipeline.add_component("generator", GoogleGenAIChatGenerator(model="gemini-2.5-flash"))

rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder.prompt", "generator.messages")

print("RAG pipeline created")
print(" Question → Text Embedder → Retriever → Prompt Builder → Generator → Answer")
question = "What can you tell me about OpenAI? Include details about their industry, products, and any relevant information for sales outreach."

# Run the full RAG pipeline; keep the retriever output so we can inspect
# which documents grounded the answer.
result = rag_pipeline.run(
    data={
        "text_embedder": {"text": question},
        "prompt_builder": {"question": question},
    },
    include_outputs_from={"retriever"},
)

answer = result["generator"]["replies"][0].text
print(answer)

# Show retrieved documents.
if "retriever" in result:
    retrieved_docs = result["retriever"]["documents"]
    print(f"\nRetrieved {len(retrieved_docs)} relevant documents from MongoDB")
    for i, doc in enumerate(retrieved_docs, 1):
        print(f"\nDocument {i}:")
        print(f" Company: {doc.meta.get('company_name', 'N/A')}")
        print(f" Source: {doc.meta.get('dataset_type', 'N/A')}")
        print(f" Location: {doc.meta.get('location', 'N/A')}")
        print(f" Industry: {doc.meta.get('industry', 'N/A')}")
        print(f" Content: {doc.content[:300]}...")
The lead intelligence database uses a flexible schema that accommodates data from multiple sources while enabling powerful hybrid search capabilities.

This structure enables three search modes:
Semantic Search: Find similar companies/people based on meaning
Query: “AI startups focused on enterprise automation”
Matches: Companies with similar descriptions, even if wording differs
Metadata Filtering: Exact match on structured fields
Filter: funding_stage = "Series A" AND location = "New York, NY"
Returns: Only companies meeting exact criteria
Hybrid Search: Combine both approaches
Semantic query: “Companies building developer tools”
Each document has three components: content (human-readable text for LLM context), embedding (768-dim vector from text-embedding-004 for semantic search), and meta (structured fields for filtering).

Company Document (Crunchbase):
You now have an AI-powered sales research assistant! Customize the pipeline to scrape additional data sources, add more metadata filters, or adjust the RAG prompts for your specific sales workflow.