/*
       *  linux/arch/i386/mm/init.c
       *
       *  Copyright (C) 1995  Linus Torvalds
       *
       *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
       */
      
      #include <linux/config.h>
      #include <linux/signal.h>
      #include <linux/sched.h>
      #include <linux/kernel.h>
      #include <linux/errno.h>
      #include <linux/string.h>
      #include <linux/types.h>
      #include <linux/ptrace.h>
      #include <linux/mman.h>
      #include <linux/mm.h>
      #include <linux/swap.h>
      #include <linux/smp.h>
      #include <linux/init.h>
      #ifdef CONFIG_BLK_DEV_INITRD
      #include <linux/blk.h>
      #endif
      #include <linux/highmem.h>
      #include <linux/pagemap.h>
      #include <linux/bootmem.h>
      
      #include <asm/processor.h>
      #include <asm/system.h>
      #include <asm/uaccess.h>
      #include <asm/pgtable.h>
      #include <asm/pgalloc.h>
      #include <asm/dma.h>
      #include <asm/fixmap.h>
      #include <asm/e820.h>
      #include <asm/apic.h>
      
      unsigned long highstart_pfn, highend_pfn;
      static unsigned long totalram_pages;
      static unsigned long totalhigh_pages;
      
      /*
       * BAD_PAGE is the page that is used for page faults when linux
       * is out-of-memory. Older versions of linux just did a
       * do_exit(), but using this instead means there is less risk
        * of a process dying in kernel mode, possibly leaving an inode
        * unused, etc.
       *
       * BAD_PAGETABLE is the accompanying page-table: it is initialized
       * to point to BAD_PAGE entries.
       *
       * ZERO_PAGE is a special page that is used for zero-initialized
       * data and COW.
       */
      
      /*
       * These are allocated in head.S so that we get proper page alignment.
       * If you change the size of these then change head.S as well.
       */
      extern char empty_bad_page[PAGE_SIZE];
      #if CONFIG_X86_PAE
      extern pmd_t empty_bad_pmd_table[PTRS_PER_PMD];
      #endif
      extern pte_t empty_bad_pte_table[PTRS_PER_PTE];
      
      /*
       * We init them before every return and make them writable-shared.
       * This guarantees we get out of the kernel in some more or less sane
       * way.
       */
      #if CONFIG_X86_PAE
      static pmd_t * get_bad_pmd_table(void)
      {
      	pmd_t v;
      	int i;
      
      	set_pmd(&v, __pmd(_PAGE_TABLE + __pa(empty_bad_pte_table)));
      
      	for (i = 0; i < PAGE_SIZE/sizeof(pmd_t); i++)
      		empty_bad_pmd_table[i] = v;
      
      	return empty_bad_pmd_table;
      }
      #endif
      
       static pte_t * get_bad_pte_table(void)
      {
      	pte_t v;
      	int i;
      
      	v = pte_mkdirty(mk_pte_phys(__pa(empty_bad_page), PAGE_SHARED));
      
       	for (i = 0; i < PAGE_SIZE/sizeof(pte_t); i++)
      		empty_bad_pte_table[i] = v;
      
       	return empty_bad_pte_table;
      }
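
       /*
        * Called when a pmd entry is found to be corrupt: report it and
        * point the entry at the bad page table, so that later faults on
        * this range hit BAD_PAGE instead of random memory.  The _kernel
        * variant installs the entry with kernel-only protections.
        */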
      
      
      
       void __handle_bad_pmd(pmd_t *pmd)
      {
      	pmd_ERROR(*pmd);
      	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(get_bad_pte_table())));
      }
      
       void __handle_bad_pmd_kernel(pmd_t *pmd)
      {
      	pmd_ERROR(*pmd);
      	set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(get_bad_pte_table())));
      }
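
       /*
        * Slow path for allocating a kernel pte table, typically reached
        * once the page-table quicklists are empty.  If the page allocation
        * fails, the pmd is pointed at the bad page table so the kernel can
        * limp along instead of dereferencing garbage.
        */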
      
       pte_t *get_pte_kernel_slow(pmd_t *pmd, unsigned long offset)
      {
      	pte_t *pte;
      
      	pte = (pte_t *) __get_free_page(GFP_KERNEL);
       	if (pmd_none(*pmd)) {
       		if (pte) {
       			clear_page(pte);
       			set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
       			return pte + offset;
       		}
       		set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(get_bad_pte_table())));
       		return NULL;
       	}
       	free_page((unsigned long)pte);
       	if (pmd_bad(*pmd)) {
       		__handle_bad_pmd_kernel(pmd);
       		return NULL;
       	}
       	return (pte_t *) pmd_page(*pmd) + offset;
      }
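
       /*
        * User-space counterpart of get_pte_kernel_slow(): identical logic,
        * but the new pte page is installed with user-accessible
        * (_PAGE_TABLE) protections.
        */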
      
       pte_t *get_pte_slow(pmd_t *pmd, unsigned long offset)
      {
      	unsigned long pte;
      
      	pte = (unsigned long) __get_free_page(GFP_KERNEL);
       	if (pmd_none(*pmd)) {
       		if (pte) {
       			clear_page((void *)pte);
       			set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));
       			return (pte_t *)pte + offset;
       		}
       		set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(get_bad_pte_table())));
       		return NULL;
       	}
       	free_page(pte);
       	if (pmd_bad(*pmd)) {
       		__handle_bad_pmd(pmd);
       		return NULL;
       	}
       	return (pte_t *) pmd_page(*pmd) + offset;
      }
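
       /*
        * Trim the page-table quicklists when they grow beyond 'high',
        * releasing cached pgd/pmd/pte pages back to the page allocator
        * until the cache drops to 'low'.  Returns the number of pages
        * freed.
        */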
      
       int do_check_pgt_cache(int low, int high)
      {
      	int freed = 0;
       	if(pgtable_cache_size > high) {
       		do {
       			if(pgd_quicklist)
       				free_pgd_slow(get_pgd_fast()), freed++;
       			if(pmd_quicklist)
       				free_pmd_slow(get_pmd_fast()), freed++;
       			if(pte_quicklist)
       				free_pte_slow(get_pte_fast()), freed++;
       		} while(pgtable_cache_size > low);
       	}
       	return freed;
      }
      
      /*
        * NOTE: pagetable_init() allocates all the fixmap pagetables
        * contiguously in physical space, so we can cache the location of
        * the first one and move around without checking the pgd every time.
       */
      
      #if CONFIG_HIGHMEM
      pte_t *kmap_pte;
      pgprot_t kmap_prot;
      
      #define kmap_get_fixmap_pte(vaddr)					\
      	pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr))
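
       /*
        * Cache the pte for the first atomic-kmap fixmap slot and the
        * protection bits to use, so the kmap code can index straight from
        * kmap_pte instead of walking the page tables each time (see the
        * NOTE above about the fixmap pagetables being contiguous).
        */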
      
      void __init kmap_init(void)
      {
      	unsigned long kmap_vstart;
      
      	/* cache the first kmap pte */
      	kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
      	kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
      
      	kmap_prot = PAGE_KERNEL;
      }
      #endif /* CONFIG_HIGHMEM */
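
       /*
        * Print a summary of the system's memory state (free areas, swap,
        * per-type page counts) to the console; useful when debugging
        * out-of-memory situations.
        */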
      
       void show_mem(void)
      {
      	int i, total = 0, reserved = 0;
      	int shared = 0, cached = 0;
      	int highmem = 0;
      
      	printk("Mem-info:\n");
      	show_free_areas();
      	printk("Free swap:       %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
      	i = max_mapnr;
       	while (i-- > 0) {
       		total++;
       		if (PageHighMem(mem_map+i))
       			highmem++;
       		if (PageReserved(mem_map+i))
       			reserved++;
       		else if (PageSwapCache(mem_map+i))
       			cached++;
       		else if (page_count(mem_map+i))
      			shared += page_count(mem_map+i) - 1;
      	}
      	printk("%d pages of RAM\n", total);
      	printk("%d pages of HIGHMEM\n",highmem);
      	printk("%d reserved pages\n",reserved);
      	printk("%d pages shared\n",shared);
      	printk("%d pages swap cached\n",cached);
      	printk("%ld pages in page table cache\n",pgtable_cache_size);
      	show_buffers();
      }
      
      /* References to section boundaries */
      
      extern char _text, _etext, _edata, __bss_start, _end;
      extern char __init_begin, __init_end;
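
       /*
        * Install a single kernel pte mapping 'vaddr' to 'phys'.  The page
        * tables covering the fixmap range are built in pagetable_init(),
        * so finding a missing pgd/pmd here is a bug rather than a reason
        * to allocate.
        */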
      
       static inline void set_pte_phys (unsigned long vaddr,
      			unsigned long phys, pgprot_t flags)
      {
      	pgprot_t prot;
      	pgd_t *pgd;
      	pmd_t *pmd;
      	pte_t *pte;
      
      	pgd = swapper_pg_dir + __pgd_offset(vaddr);
       	if (pgd_none(*pgd)) {
       		printk("PAE BUG #00!\n");
       		return;
       	}
       	pmd = pmd_offset(pgd, vaddr);
       	if (pmd_none(*pmd)) {
       		printk("PAE BUG #01!\n");
       		return;
       	}
       	pte = pte_offset(pmd, vaddr);
       	if (pte_val(*pte))
      		pte_ERROR(*pte);
      	pgprot_val(prot) = pgprot_val(PAGE_KERNEL) | pgprot_val(flags);
      	set_pte(pte, mk_pte_phys(phys, prot));
      
      	/*
      	 * It's enough to flush this one mapping.
      	 * (PGE mappings get flushed as well)
      	 */
      	__flush_tlb_one(vaddr);
      }
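
       /*
        * Bind a fixmap slot to a physical address.  The virtual address of
        * each slot is a compile-time constant; only the physical side is
        * chosen at run time.
        */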
      
       void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
      {
      	unsigned long address = __fix_to_virt(idx);
      
       	if (idx >= __end_of_fixed_addresses) {
       		printk("Invalid __set_fixmap\n");
       		return;
      	}
      	set_pte_phys(address, phys, flags);
      }
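
       /*
        * Build the page-table structure (pmds and pte pages, but no
        * mappings) for the virtual range start..end, so that set_fixmap()
        * and the kmap code can later install entries without having to
        * allocate memory.
        */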
      
       static void __init fixrange_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
      {
      	pgd_t *pgd;
      	pmd_t *pmd;
      	pte_t *pte;
      	int i, j;
      	unsigned long vaddr;
      
      	vaddr = start;
      	i = __pgd_offset(vaddr);
      	j = __pmd_offset(vaddr);
      	pgd = pgd_base + i;
      
       	for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) {
      #if CONFIG_X86_PAE
      		if (pgd_none(*pgd)) {
      			pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
      			set_pgd(pgd, __pgd(__pa(pmd) + 0x1));
      			if (pmd != pmd_offset(pgd, 0))
      				printk("PAE BUG #02!\n");
      		}
      		pmd = pmd_offset(pgd, vaddr);
      #else
      		pmd = (pmd_t *)pgd;
      #endif
       		for (; (j < PTRS_PER_PMD) && (vaddr != end); pmd++, j++) {
       			if (pmd_none(*pmd)) {
       				pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
       				set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
       				if (pte != pte_offset(pmd, 0))
       					BUG();
      			}
      			vaddr += PMD_SIZE;
      		}
      		j = 0;
      	}
      }
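
       /*
        * Set up the kernel's linear mapping of low memory at PAGE_OFFSET
        * (using 4MB PSE pages when the CPU supports them), the page-table
        * structure for the fixmap range and, with CONFIG_HIGHMEM, the
        * permanent kmap area.
        */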
      
       static void __init pagetable_init (void)
      {
      	unsigned long vaddr, end;
      	pgd_t *pgd, *pgd_base;
      	int i, j, k;
      	pmd_t *pmd;
      	pte_t *pte;
      
      	/*
      	 * This can be zero as well - no problem, in that case we exit
      	 * the loops anyway due to the PTRS_PER_* conditions.
      	 */
      	end = (unsigned long)__va(max_low_pfn*PAGE_SIZE);
      
      	pgd_base = swapper_pg_dir;
      #if CONFIG_X86_PAE
      	for (i = 0; i < PTRS_PER_PGD; i++) {
      		pgd = pgd_base + i;
      		__pgd_clear(pgd);
      	}
      #endif
      	i = __pgd_offset(PAGE_OFFSET);
      	pgd = pgd_base + i;
      
       	for (; i < PTRS_PER_PGD; pgd++, i++) {
      		vaddr = i*PGDIR_SIZE;
       		if (end && (vaddr >= end))
       			break;
      #if CONFIG_X86_PAE
      		pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
      		set_pgd(pgd, __pgd(__pa(pmd) + 0x1));
      #else
      		pmd = (pmd_t *)pgd;
      #endif
       		if (pmd != pmd_offset(pgd, 0))
       			BUG();
       		for (j = 0; j < PTRS_PER_PMD; pmd++, j++) {
       			vaddr = i*PGDIR_SIZE + j*PMD_SIZE;
       			if (end && (vaddr >= end))
       				break;
       			if (cpu_has_pse) {
      				unsigned long __pe;
      
      				set_in_cr4(X86_CR4_PSE);
      				boot_cpu_data.wp_works_ok = 1;
      				__pe = _KERNPG_TABLE + _PAGE_PSE + __pa(vaddr);
      				/* Make it "global" too if supported */
       				if (cpu_has_pge) {
      					set_in_cr4(X86_CR4_PGE);
      					__pe += _PAGE_GLOBAL;
      				}
      				set_pmd(pmd, __pmd(__pe));
       				continue;
      			}
      
      			pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
      			set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
      
       			if (pte != pte_offset(pmd, 0))
       				BUG();
      
       			for (k = 0; k < PTRS_PER_PTE; pte++, k++) {
       				vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE;
       				if (end && (vaddr >= end))
       					break;
      				*pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL);
      			}
      		}
      	}
      
      	/*
      	 * Fixed mappings, only the page table structure has to be
      	 * created - mappings will be set by set_fixmap():
      	 */
      	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
      	fixrange_init(vaddr, 0, pgd_base);
      
      #if CONFIG_HIGHMEM
      	/*
      	 * Permanent kmaps:
      	 */
      	vaddr = PKMAP_BASE;
      	fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
      
      	pgd = swapper_pg_dir + __pgd_offset(vaddr);
      	pmd = pmd_offset(pgd, vaddr);
      	pte = pte_offset(pmd, vaddr);
      	pkmap_page_table = pte;
      #endif
      
      #if CONFIG_X86_PAE
      	/*
        * Add low memory identity-mappings - SMP needs them when
      	 * starting up on an AP from real-mode. In the non-PAE
      	 * case we already have these mappings through head.S.
      	 * All user-space mappings are explicitly cleared after
      	 * SMP startup.
      	 */
      	pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
      #endif
      }
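
       /*
        * Remove the low "identity" mappings of physical memory from
        * swapper_pg_dir.  They are only needed while secondary CPUs are
        * brought up through real mode; dropping them also makes NULL
        * pointer dereferences in the kernel fault as they should.
        */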
      
       void __init zap_low_mappings (void)
      {
      	int i;
      	/*
      	 * Zap initial low-memory mappings.
      	 *
      	 * Note that "pgd_clear()" doesn't do it for
      	 * us in this case, because pgd_clear() is a
      	 * no-op in the 2-level case (pmd_clear() is
      	 * the thing that clears the page-tables in
      	 * that case).
      	 */
       	for (i = 0; i < USER_PTRS_PER_PGD; i++)
      #if CONFIG_X86_PAE
      		pgd_clear(swapper_pg_dir+i);
      #else
      		set_pgd(swapper_pg_dir+i, __pgd(0));
      #endif
       	flush_tlb_all();
      }
      
      /*
       * paging_init() sets up the page tables - note that the first 8MB are
       * already mapped by head.S.
       *
        * This routine also unmaps the page at virtual kernel address 0, so
       * that we can trap those pesky NULL-reference errors in the kernel.
       */
       void __init paging_init(void)
      {
      	pagetable_init();
      
      	__asm__( "movl %%ecx,%%cr3\n" ::"c"(__pa(swapper_pg_dir)));
      
      #if CONFIG_X86_PAE
      	/*
        * We will bail out later - printk doesn't work right now so
      	 * the user would just see a hanging kernel.
      	 */
      	if (cpu_has_pae)
      		set_in_cr4(X86_CR4_PAE);
      #endif
      
       	__flush_tlb_all();
      
      #ifdef CONFIG_HIGHMEM
      	kmap_init();
      #endif
      	{
      		unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
      		unsigned int max_dma, high, low;
      
      		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
      		low = max_low_pfn;
      		high = highend_pfn;
      
       		if (low < max_dma)
       			zones_size[ZONE_DMA] = low;
       		else {
      			zones_size[ZONE_DMA] = max_dma;
      			zones_size[ZONE_NORMAL] = low - max_dma;
      #ifdef CONFIG_HIGHMEM
      			zones_size[ZONE_HIGHMEM] = high - low;
      #endif
      		}
      		free_area_init(zones_size);
      	}
       	return;
      }
      
      /*
       * Test if the WP bit works in supervisor mode. It isn't supported on 386's
       * and also on some strange 486's (NexGen etc.). All 586+'s are OK. The jumps
        * before and after the test are here to work around some nasty CPU bugs.
       */
      
      /*
       * This function cannot be __init, since exceptions don't work in that
       * section.
       */
      static int do_test_wp_bit(unsigned long vaddr);
      
       void __init test_wp_bit(void)
      {
      /*
       * Ok, all PSE-capable CPUs are definitely handling the WP bit right.
       */
      	const unsigned long vaddr = PAGE_OFFSET;
      	pgd_t *pgd;
      	pmd_t *pmd;
      	pte_t *pte, old_pte;
      
      	printk("Checking if this processor honours the WP bit even in supervisor mode... ");
      
      	pgd = swapper_pg_dir + __pgd_offset(vaddr);
      	pmd = pmd_offset(pgd, vaddr);
      	pte = pte_offset(pmd, vaddr);
      	old_pte = *pte;
      	*pte = mk_pte_phys(0, PAGE_READONLY);
       	local_flush_tlb();
      
      	boot_cpu_data.wp_works_ok = do_test_wp_bit(vaddr);
      
      	*pte = old_pte;
       	local_flush_tlb();
      
       	if (!boot_cpu_data.wp_works_ok) {
      		printk("No.\n");
      #ifdef CONFIG_X86_WP_WORKS_OK
      		panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
      #endif
       	} else {
      		printk("Ok.\n");
      	}
      }
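
       /*
        * Consult the BIOS-provided e820 map to decide whether page frame
        * 'pagenr' is usable RAM, as opposed to a reserved or ACPI region.
        */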
      
       static inline int page_is_ram (unsigned long pagenr)
      {
      	int i;
      
       	for (i = 0; i < e820.nr_map; i++) {
      		unsigned long addr, end;
      
       		if (e820.map[i].type != E820_RAM)	/* not usable memory */
       			continue;
      		/*
      		 *	!!!FIXME!!! Some BIOSen report areas as RAM that
      		 *	are not. Notably the 640->1Mb area. We need a sanity
      		 *	check here.
      		 */
      		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
      		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
       		if  ((pagenr >= addr) && (pagenr < end))
       			return 1;
      	}
       	return 0;
      }
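
       /*
        * Final boot-time memory setup: hand the bootmem pages over to the
        * buddy allocator, account for reserved pages, release highmem
        * pages and print the "Memory: ..." banner.
        */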
      
       void __init mem_init(void)
      {
      	int codesize, reservedpages, datasize, initsize;
      	int tmp;
      
       	if (!mem_map)
       		BUG();
      
      #ifdef CONFIG_HIGHMEM
      	highmem_start_page = mem_map + highstart_pfn;
      	max_mapnr = num_physpages = highend_pfn;
      #else
      	max_mapnr = num_physpages = max_low_pfn;
      #endif
      	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
      
      	/* clear the zero-page */
      	memset(empty_zero_page, 0, PAGE_SIZE);
      
      	/* this will put all low memory onto the freelists */
      	totalram_pages += free_all_bootmem();
      
      	reservedpages = 0;
       	for (tmp = 0; tmp < max_low_pfn; tmp++)
      		/*
      		 * Only count reserved RAM pages
      		 */
       		if (page_is_ram(tmp) && PageReserved(mem_map+tmp))
      			reservedpages++;
      #ifdef CONFIG_HIGHMEM
      	for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) {
      		struct page *page = mem_map + tmp;
      
      		if (!page_is_ram(tmp)) {
      			SetPageReserved(page);
      			continue;
      		}
      		ClearPageReserved(page);
      		set_bit(PG_highmem, &page->flags);
      		atomic_set(&page->count, 1);
      		__free_page(page);
      		totalhigh_pages++;
      	}
      	totalram_pages += totalhigh_pages;
      #endif
      	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
      	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
      	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
      
      	printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
      		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
      		max_mapnr << (PAGE_SHIFT-10),
      		codesize >> 10,
      		reservedpages << (PAGE_SHIFT-10),
      		datasize >> 10,
      		initsize >> 10,
      		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
      	       );
      
      #if CONFIG_X86_PAE
      	if (!cpu_has_pae)
      		panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
      #endif
       	if (boot_cpu_data.wp_works_ok < 0)
      		test_wp_bit();
      
      	/*
        * Subtle. SMP is doing its boot stuff late (because it has to
      	 * fork idle threads) - but it also needs low mappings for the
      	 * protected-mode entry to work. We zap these entries only after
      	 * the WP-bit has been tested.
      	 */
      #ifndef CONFIG_SMP
      	zap_low_mappings();
      #endif
      
      }
      
      /* Put this after the callers, so that it cannot be inlined */
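       /*
        * 'flag' starts out as 1.  The write at label 1 goes to the
        * read-only mapping set up by test_wp_bit(); if the CPU honours WP
        * in supervisor mode the write faults, the exception table entry
        * sends us to label 2 and flag stays 1.  If the write silently
        * succeeds, the xorl clears flag to 0.
        */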
       static int do_test_wp_bit(unsigned long vaddr)
      {
      	char tmp_reg;
      	int flag;
      
      	__asm__ __volatile__(
      		"	movb %0,%1	\n"
      		"1:	movb %1,%0	\n"
      		"	xorl %2,%2	\n"
      		"2:			\n"
      		".section __ex_table,\"a\"\n"
      		"	.align 4	\n"
      		"	.long 1b,2b	\n"
      		".previous		\n"
      		:"=m" (*(char *) vaddr),
      		 "=q" (tmp_reg),
      		 "=r" (flag)
      		:"2" (1)
      		:"memory");
      	
       	return flag;
      }
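
       /*
        * Release the pages holding the __init text and data sections now
        * that initialization is complete, returning them to the page
        * allocator.
        */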
      
       void free_initmem(void)
      {
      	unsigned long addr;
      
      	addr = (unsigned long)(&__init_begin);
       	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
      		ClearPageReserved(virt_to_page(addr));
      		set_page_count(virt_to_page(addr), 1);
      		free_page(addr);
      		totalram_pages++;
      	}
      	printk ("Freeing unused kernel memory: %dk freed\n", (&__init_end - &__init_begin) >> 10);
      }
      
      #ifdef CONFIG_BLK_DEV_INITRD
      void free_initrd_mem(unsigned long start, unsigned long end)
      {
      	if (start < end)
      		printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
      	for (; start < end; start += PAGE_SIZE) {
      		ClearPageReserved(virt_to_page(start));
      		set_page_count(virt_to_page(start), 1);
      		free_page(start);
      		totalram_pages++;
      	}
      }
      #endif
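
       /*
        * Fill in the memory-related fields of a struct sysinfo for the
        * sysinfo(2) system call.
        */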
      
       void si_meminfo(struct sysinfo *val)
      {
      	val->totalram = totalram_pages;
      	val->sharedram = 0;
      	val->freeram = nr_free_pages();
      	val->bufferram = atomic_read(&buffermem_pages);
      	val->totalhigh = totalhigh_pages;
      	val->freehigh = nr_free_highpages();
      	val->mem_unit = PAGE_SIZE;
       	return;
      }